From 0292abbef10bac5e0a45fd7dea3460aa2badef48 Mon Sep 17 00:00:00 2001
From: Christopher Bartz
Date: Mon, 9 Sep 2024 15:40:46 +0200
Subject: [PATCH] Externalise Runner Manager (#358)

* Spawn a manual test env
* Disable spawning on manual test env
* Remove useless class
* Fix runner deletion
* Fix import error
* Add more docs
* Fix get non-existing openstack server
* Add debug statement
* Fix variable name and function name mixup
* Fix id variable name, function name mixup
* Add debug statement
* Move debug
* Add busy runner test
* Add debug statement.
* Disable some test
* Disable some test
* Fix runner label in workflow
* Fix lambda
* Debug
* Debug
* Add debug
* Start new manual test env
* Add none check
* Fix missing prefix
* Add more logging
* Refactor runner manager one runner fixture
* Fix error string formatting
* Add the docstring for github_runner_manager
* Fix test fixture scope
* Add docstring on cloud_runner_manager
* Add debug
* Fix docstring for cloud runner manager
* Add more docstrings
* Add metrics for deleted and cleanup runners
* Enable tests again
* Add debug
* Get runner info not on GitHub
* Fix dict access
* Add debug of userdata
* Fix metric path
* Debug metric
* Fix variable naming
* Test
* Fix iterator
* Debug
* Debug
* Fix for iterator return value
* Add more log path patching
* Fix path naming
* Fix monkey patch
* Start an arm64 manual test env
* Not spawning manual test env
* Update fmt
* Fix metric storage implementation for openstack
* Fix metric storage provider usage in openstack runner manager
* Debug
* Fix iterator
* Add debug
* Fix None in iterator
* Add debug
* Trying fix for get runner filter
* Add test
* Patch the path for logs
* Add cleanup test
* Debug
* Fix github state determining busy runner
* Fix wrong naming for method in ABC
* Remove debugging
* Add more docstrings
* Fix runner deletion
* Add more docs
* Fix typing
* Debug
* Update SSH health check
* Tmp disable a passing test
* Add debug
* Remove a debug
* Fix Cloud runner state init
* Change clean up to cleanup
* Fix attr naming issue in openstack cloud
* Fix reference to non-existing instance_name in openstack cloud
* Add metric log processing to test
* Enable all tests
* Fix health check return value
* Fix all flake8 lints
* Fix test
* Fix all lints
* Fix unit test issue due to method sig change
* Ignore openstack cloud from coverage because the test requires a private endpoint
* Enable all tests
* Remove a repeated test
* Re-enable test.yaml
* Fix integration tests workflow
* Add docs on cleanup method of cloud runner manager
* Add parallel spawning of runners.
* Enable dev testing
* Fix parallel spawn
* Allow openstack server to take a bit of time on deletion
* Refactor test detection of no runners
* Re-enable the tests
* Fix lints
* Disable tests again
* Disable some test
* Add wait until runner is running
* Enable openstack runner manager tests
* Add debug
* Wait for github state
* Refactor wait until runner spawn
* Add keyfile error
* Remove debug statement
* Re-enable all tests
* Update src/manager/github_runner_manager.py

Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com>

* Update src/openstack_cloud/openstack_cloud.py

Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com>

* Suggestions
* Refactor remove openstack server
* Test spawning two runners.
* Fix test
* Fix naming
* Fix according to comment
* Fix clouds yaml write issue.
* Fix format
* Add delete runner by amount
* Add getting runner health state for metrics
* Fix security group ID issues
* Fix according to review
* Refactor health state for runner
* Fix lint issues
* Add missing docs
* Update the github state enum to use auto
* Rename class to fit convention
* Fix according to review
* Fix name_prefix property cloud runner manager
* Add class for scaling runners
* Fix lints
* Fix unit test
* Fix according to review comment
* Fix test according to comments
* Fix unit test
* Fix typo of attr
* Add debug
* Add debug statement
* Debug
* Fix return code of the kill command
* Remove debug
* Add comments on the flush kill command
* Add debug
* Fix debug
* Debug
* Debug
* Remove debug
* Add cleanup during idle and busy runner test
* Debug
* Disable tests during debug
* Debug missing keyfiles
* Fix keyfile path matching issue
* testing
* debug
* Add debug
* Use OR
* debug
* Debug
* Debug
* Debug
* Debug
* Debug
* Fix flush mode
* Remove debug
* Re-enable all tests
* Initial unit test for runner scaler
* Add more unit tests for runner scaler
* Add more tests
* Fix merge issues
* Fix states in get_runners methods
* Add docstring for unit test mocks
* Fix construction of repo-policy-compliance from config
* Fix get_runners action output
* Fix the lints
* Fix a naming issue
* Fix naming prefix of runner
* Improve unit test
* Remove the old OpenstackRunnerManager
* Fix test construction of runner manager.
* Fix flavor naming
* Fix flush action result output.
* Fix flavor of metric
* Testing out an integration test fix
* change flush runner to flush idle.
* Add debug in integration test
* Manual test mode
* Start new manual test env
* Spawn x64 manual test env.
* Improve logging during reconcile
* Fix crashed metric collection
* Remove debug workflow
* Format
* Test
* externalise
* fix tests
* Add reactive back in
* Fix flushing of runners
* Debug workflow
* Add debug
* Fix logging of health state
* Remove debug
* Debug
* Fix set construction
* Fix SSH key path in integration test setup
* Add more checks to repo-policy-compliance setup in tests
* Fix key path check
* Fix format string issue
* Fix format string typo
* fix integration test import
* outcomment externalised workflow
* Add some logging of test setup
* Fix missing await
* Revert config-change flushing
* Add maintenance status for image relation change
* Fix HTTP format
* Revert "outcomment externalised workflow"

This reverts commit e0a78af914a097814b3a4c1df013da5e40bf9ac8.
* re-checkin integration test * use github types from externalised app * use github types from externalised app * Update coverage ignore of github_runner_manager * Minor fix in test comments * lint and fix unit tests * fix merge * remove reactive script * fix merge * remove unused OpenstackUnauthorizedError * final new line * remove code duplication * remove duplicate src-docs * remove openstack-userdata.sh.j2 * pin commit in github-runner-manager --------- Co-authored-by: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> --- pyproject.toml | 2 +- requirements.txt | 1 + scripts/reactive_runner.py | 50 -- src-docs/charm.md | 10 +- .../{charm_state.py.md => charm_state.md} | 589 +++++++------ src-docs/errors.md | 232 +---- src-docs/github_client.md | 169 +--- src-docs/runner.md | 16 +- src-docs/runner_manager.md | 6 +- src-docs/runner_type.md | 10 +- src-docs/shared_fs.md | 8 +- src-docs/utilities.md | 100 +-- src/charm.py | 33 +- src/charm_state.py | 80 +- src/errors.py | 94 +- src/github_client.py | 250 +----- src/github_type.py | 164 ---- src/logrotate.py | 4 +- src/manager/cloud_runner_manager.py | 203 ----- src/manager/github_runner_manager.py | 133 --- src/manager/runner_manager.py | 364 -------- src/manager/runner_scaler.py | 215 ----- src/metrics/__init__.py | 4 - src/metrics/events.py | 167 ---- src/metrics/github.py | 53 -- src/metrics/runner.py | 470 ---------- src/metrics/runner_logs.py | 54 -- src/metrics/storage.py | 192 ---- src/metrics/type.py | 23 - src/openstack_cloud/__init__.py | 78 -- src/openstack_cloud/openstack_cloud.py | 597 ------------- .../openstack_runner_manager.py | 830 ------------------ src/reactive/__init__.py | 4 - src/reactive/consumer.py | 112 --- src/reactive/runner_manager.py | 141 --- src/repo_policy_compliance_client.py | 73 -- src/runner.py | 7 +- src/runner_manager.py | 16 +- src/runner_manager_type.py | 6 +- src/runner_type.py | 4 +- src/shared_fs.py | 3 +- src/utilities.py | 149 +--- templates/openstack-userdata.sh.j2 | 105 --- tests/integration/helpers/charm_metrics.py | 6 +- tests/integration/helpers/openstack.py | 2 +- .../integration/test_charm_metrics_failure.py | 4 +- .../integration/test_charm_metrics_success.py | 2 +- tests/integration/test_reactive.py | 4 +- .../test_runner_manager_openstack.py | 31 +- tests/integration/test_self_hosted_runner.py | 2 +- tests/unit/conftest.py | 12 +- tests/unit/metrics/__init__.py | 2 - tests/unit/metrics/test_events.py | 57 -- tests/unit/metrics/test_github.py | 70 -- tests/unit/metrics/test_runner.py | 649 -------------- tests/unit/metrics/test_runner_logs.py | 31 - tests/unit/metrics/test_storage.py | 168 ---- tests/unit/mock.py | 3 +- tests/unit/mock_runner_managers.py | 13 +- tests/unit/reactive/__init__.py | 2 - tests/unit/reactive/test_consumer.py | 89 -- tests/unit/reactive/test_runner_manager.py | 175 ---- tests/unit/test_charm.py | 6 +- tests/unit/test_charm_state.py | 21 +- tests/unit/test_github_client.py | 208 ----- tests/unit/test_lxd_runner_manager.py | 21 +- tests/unit/test_openstack_cloud.py | 41 - tests/unit/test_runner.py | 13 +- tests/unit/test_runner_scaler.py | 25 +- tests/unit/test_shared_fs.py | 2 +- 70 files changed, 528 insertions(+), 6952 deletions(-) delete mode 100644 scripts/reactive_runner.py rename src-docs/{charm_state.py.md => charm_state.md} (71%) delete mode 100644 src/github_type.py delete mode 100644 src/manager/cloud_runner_manager.py delete mode 100644 src/manager/github_runner_manager.py 
delete mode 100644 src/manager/runner_manager.py delete mode 100644 src/manager/runner_scaler.py delete mode 100644 src/metrics/__init__.py delete mode 100644 src/metrics/events.py delete mode 100644 src/metrics/github.py delete mode 100644 src/metrics/runner.py delete mode 100644 src/metrics/runner_logs.py delete mode 100644 src/metrics/storage.py delete mode 100644 src/metrics/type.py delete mode 100644 src/openstack_cloud/__init__.py delete mode 100644 src/openstack_cloud/openstack_cloud.py delete mode 100644 src/openstack_cloud/openstack_runner_manager.py delete mode 100644 src/reactive/__init__.py delete mode 100644 src/reactive/consumer.py delete mode 100644 src/reactive/runner_manager.py delete mode 100644 src/repo_policy_compliance_client.py delete mode 100644 templates/openstack-userdata.sh.j2 delete mode 100644 tests/unit/metrics/__init__.py delete mode 100644 tests/unit/metrics/test_events.py delete mode 100644 tests/unit/metrics/test_github.py delete mode 100644 tests/unit/metrics/test_runner.py delete mode 100644 tests/unit/metrics/test_runner_logs.py delete mode 100644 tests/unit/metrics/test_storage.py delete mode 100644 tests/unit/reactive/__init__.py delete mode 100644 tests/unit/reactive/test_consumer.py delete mode 100644 tests/unit/reactive/test_runner_manager.py delete mode 100644 tests/unit/test_github_client.py delete mode 100644 tests/unit/test_openstack_cloud.py diff --git a/pyproject.toml b/pyproject.toml index d16bac3a9..f4a49bd2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ omit = [ ] [tool.coverage.report] -fail_under = 85 +fail_under = 83 show_missing = true [tool.pytest.ini_options] diff --git a/requirements.txt b/requirements.txt index 541c0d4c9..4d219d184 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ PyYAML ==6.0.* pyOpenSSL==24.2.1 kombu==5.4.0 pymongo==4.8.0 +github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@1f310b22b99a94bd5429184191558426b014ee82 diff --git a/scripts/reactive_runner.py b/scripts/reactive_runner.py deleted file mode 100644 index e9b996ff6..000000000 --- a/scripts/reactive_runner.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Script to spawn a reactive runner process.""" -import logging -import os -import sys - -from reactive.consumer import consume -from reactive.runner_manager import MQ_URI_ENV_VAR, QUEUE_NAME_ENV_VAR - - -def setup_root_logging() -> None: - """Set up logging for the reactive runner process.""" - # setup root logger to log in a file which will be picked up by grafana agent and sent to Loki - logging.basicConfig( - stream=sys.stdout, - level=logging.DEBUG, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - ) - - -def main() -> None: - """Spawn a process that consumes a message from the queue to create a runner. - - Raises: - ValueError: If the required environment variables are not set - """ - mq_uri = os.environ.get(MQ_URI_ENV_VAR) - queue_name = os.environ.get(QUEUE_NAME_ENV_VAR) - - if not mq_uri: - raise ValueError( - f"Missing {MQ_URI_ENV_VAR} environment variable. " - "Please set it to the message queue URI." - ) - - if not queue_name: - raise ValueError( - f"Missing {QUEUE_NAME_ENV_VAR} environment variable. " - "Please set it to the name of the queue." 
- ) - - setup_root_logging() - consume(mq_uri, queue_name) - - -if __name__ == "__main__": - main() diff --git a/src-docs/charm.md b/src-docs/charm.md index 9fd2aac04..fa38de542 100644 --- a/src-docs/charm.md +++ b/src-docs/charm.md @@ -20,7 +20,7 @@ Charm for creating and managing GitHub self-hosted runner instances. --- - + ## function `catch_charm_errors` @@ -46,7 +46,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -72,7 +72,7 @@ Catch common errors in actions. --- - + ## class `ReconcileRunnersEvent` Event representing a periodic check to ensure runners are ok. @@ -83,7 +83,7 @@ Event representing a periodic check to ensure runners are ok. --- - + ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. @@ -100,7 +100,7 @@ Charm for managing GitHub self-hosted runners. - `ram_pool_path`: The path to memdisk storage. - `kernel_module_path`: The path to kernel modules. - + ### method `__init__` diff --git a/src-docs/charm_state.py.md b/src-docs/charm_state.md similarity index 71% rename from src-docs/charm_state.py.md rename to src-docs/charm_state.md index 5783f6821..9b4889d5e 100644 --- a/src-docs/charm_state.py.md +++ b/src-docs/charm_state.md @@ -2,7 +2,7 @@ -# module `charm_state.py` +# module `charm_state` State of the Charm. **Global Variables** @@ -33,49 +33,100 @@ State of the Charm. - **COS_AGENT_INTEGRATION_NAME** - **DEBUG_SSH_INTEGRATION_NAME** - **IMAGE_INTEGRATION_NAME** +- **MONGO_DB_INTEGRATION_NAME** - **LTS_IMAGE_VERSION_TAG_MAP** + +--- + + + +## class `AnyHttpsUrl` +Represents an HTTPS URL. + + + +**Attributes:** + + - `allowed_schemes`: Allowed schemes for the URL. + + + + + --- - + + +## class `GithubConfig` +Charm configuration related to GitHub. -## function `parse_github_path` + + +**Attributes:** + + - `token`: The Github API access token (PAT). + - `path`: The Github org/repo path. + + + +### method `__init__` ```python -parse_github_path(path_str: str, runner_group: str) → GithubOrg | GithubRepo +__init__(token: str, path: GitHubOrg | GitHubRepo) → None ``` -Parse GitHub path. + + + + + + + +--- + + + +### classmethod `from_charm` + +```python +from_charm(charm: CharmBase) → GithubConfig +``` + +Get github related charm configuration values from charm. **Args:** - - `path_str`: GitHub path in string format. - - `runner_group`: Runner group name for GitHub organization. If the path is a repository this argument is ignored. + - `charm`: The charm instance. **Raises:** - - `CharmConfigInvalidError`: if an invalid path string was given. + - `CharmConfigInvalidError`: If an invalid configuration value was set. **Returns:** - GithubPath object representing the GitHub repository, or the GitHub organization with runner group information. + The parsed GitHub configuration values. --- -## class `AnyHttpsUrl` -Represents an HTTPS URL. + + +## class `VirtualMachineResources` +Virtual machine resource configuration. **Attributes:** - - `allowed_schemes`: Allowed schemes for the URL. + - `cpu`: Number of vCPU for the virtual machine. + - `memory`: Amount of memory for the virtual machine. + - `disk`: Amount of disk for the virtual machine. @@ -83,6 +134,8 @@ Represents an HTTPS URL. --- + + ## class `Arch` Supported system architectures. @@ -99,15 +152,17 @@ Supported system architectures. --- -## class `BaseImage` -The ubuntu OS base image to build and deploy runners on. + + +## class `RunnerStorage` +Supported storage as runner disk. **Attributes:** - - `JAMMY`: The jammy ubuntu LTS image. 
- - `NOBLE`: The noble ubuntu LTS image. + - `JUJU_STORAGE`: Represents runner storage from Juju storage. + - `MEMORY`: Represents tempfs storage (ramdisk). @@ -115,64 +170,80 @@ The ubuntu OS base image to build and deploy runners on. --- -## class `CharmConfig` -General charm configuration. + -Some charm configurations are grouped into other configuration models. +## class `InstanceType` +Type of instance for runner. **Attributes:** - - `denylist`: List of IPv4 to block the runners from accessing. - - `dockerhub_mirror`: Private docker registry as dockerhub mirror for the runners to use. - - `labels`: Additional runner labels to append to default (i.e. os, flavor, architecture). - - `openstack_clouds_yaml`: The openstack clouds.yaml configuration. - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - `reconcile_interval`: Time between each reconciliation of runners in minutes. - - `repo_policy_compliance`: Configuration for the repo policy compliance service. - - `token`: GitHub personal access token for GitHub API. + - `LOCAL_LXD`: LXD instance on the local juju machine. + - `OPENSTACK`: OpenStack instance on a cloud. + --- - + + +## class `CharmConfigInvalidError` +Raised when charm config is invalid. + -### classmethod `check_reconcile_interval` + +**Attributes:** + + - `msg`: Explanation of the error. + + + +### method `__init__` ```python -check_reconcile_interval(reconcile_interval: int) → int +__init__(msg: str) ``` -Validate the general charm configuration. +Initialize a new instance of the CharmConfigInvalidError exception. **Args:** - - `reconcile_interval`: The value of reconcile_interval passed to class instantiation. + - `msg`: Explanation of the error. -**Raises:** + + +--- + + + +## class `RepoPolicyComplianceConfig` +Configuration for the repo policy compliance service. + + + +**Attributes:** - - `ValueError`: if an invalid reconcile_interval value of less than 2 has been passed. + - `token`: Token for the repo policy compliance service. + - `url`: URL of the repo policy compliance service. -**Returns:** - The validated reconcile_interval value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → CharmConfig +from_charm(charm: CharmBase) → RepoPolicyComplianceConfig ``` Initialize the config from charm. @@ -187,121 +258,96 @@ Initialize the config from charm. **Raises:** - - `CharmConfigInvalidError`: If any invalid configuration has been set on the charm. + - `CharmConfigInvalidError`: If an invalid configuration was set. **Returns:** - Current config of the charm. + Current repo-policy-compliance config. --- -## class `CharmConfigInvalidError` -Raised when charm config is invalid. + + +## class `OpenStackCloudsYAML` +The OpenStack clouds YAML dict mapping. **Attributes:** - - `msg`: Explanation of the error. - - - -### function `__init__` - -```python -__init__(msg: str) -``` - -Initialize a new instance of the CharmConfigInvalidError exception. - + - `clouds`: The map of cloud name to cloud connection info. -**Args:** - - - `msg`: Explanation of the error. +--- + ---- +## class `CharmConfig` +General charm configuration. -## class `CharmState` -The charm state. +Some charm configurations are grouped into other configuration models. **Attributes:** - - `arch`: The underlying compute architecture, i.e. x86_64, amd64, arm64/aarch64. - - `charm_config`: Configuration of the juju charm. - - `is_metrics_logging_available`: Whether the charm is able to issue metrics. 
- - `proxy_config`: Proxy-related configuration. - - `instance_type`: The type of instances, e.g., local lxd, openstack. - - `runner_config`: The charm configuration related to runner VM configuration. - - `ssh_debug_connections`: SSH debug connections configuration information. + - `denylist`: List of IPv4 to block the runners from accessing. + - `dockerhub_mirror`: Private docker registry as dockerhub mirror for the runners to use. + - `labels`: Additional runner labels to append to default (i.e. os, flavor, architecture). + - `openstack_clouds_yaml`: The openstack clouds.yaml configuration. + - `path`: GitHub repository path in the format '/', or the GitHub organization name. + - `reconcile_interval`: Time between each reconciliation of runners in minutes. + - `repo_policy_compliance`: Configuration for the repo policy compliance service. + - `token`: GitHub personal access token for GitHub API. --- - + -### classmethod `from_charm` +### classmethod `check_reconcile_interval` ```python -from_charm(charm: CharmBase) → CharmState +check_reconcile_interval(reconcile_interval: int) → int ``` -Initialize the state from charm. +Validate the general charm configuration. **Args:** - - `charm`: The charm instance. + - `reconcile_interval`: The value of reconcile_interval passed to class instantiation. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration was set. + - `ValueError`: if an invalid reconcile_interval value of less than 2 has been passed. **Returns:** - Current state of the charm. - - ---- - -## class `GithubConfig` -Charm configuration related to GitHub. - - - -**Attributes:** - - - `token`: The Github API access token (PAT). - - `path`: The Github org/repo path. - - - + The validated reconcile_interval value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → GithubConfig +from_charm(charm: CharmBase) → CharmConfig ``` -Get github related charm configuration values from charm. +Initialize the config from charm. @@ -313,123 +359,128 @@ Get github related charm configuration values from charm. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration value was set. + - `CharmConfigInvalidError`: If any invalid configuration has been set on the charm. **Returns:** - The parsed GitHub configuration values. + Current config of the charm. --- -## class `GithubOrg` -Represent GitHub organization. + + +## class `BaseImage` +The ubuntu OS base image to build and deploy runners on. **Attributes:** - - `org`: Name of the GitHub organization. - - `group`: Runner group to spawn the runners in. + - `JAMMY`: The jammy ubuntu LTS image. + - `NOBLE`: The noble ubuntu LTS image. + --- - + -### function `path` +## class `OpenstackImage` +OpenstackImage from image builder relation data. -```python -path() → str -``` -Return a string representing the path. +**Attributes:** + + - `id`: The OpenStack image ID. + - `tags`: Image tags, e.g. jammy -**Returns:** - Path to the GitHub entity. --- -## class `GithubRepo` -Represent GitHub repository. + +### classmethod `from_charm` +```python +from_charm(charm: CharmBase) → OpenstackImage | None +``` -**Attributes:** +Initialize the OpenstackImage info from relation data. + +None represents relation not established. None values for id/tags represent image not yet ready but the relation exists. + + + +**Args:** - - `owner`: Owner of the GitHub repository. - - `repo`: Name of the GitHub repository. + - `charm`: The charm instance. +**Returns:** + OpenstackImage metadata from charm relation data. 
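The three states documented above for `OpenstackImage.from_charm` (no relation, relation without a ready image, ready image) are easiest to see in consuming code. A minimal sketch, assuming an `ops.CharmBase` instance; the status messages are illustrative and not taken from the charm:

```python
from ops import ActiveStatus, BlockedStatus, CharmBase, WaitingStatus

from charm_state import OpenstackImage


def set_status_from_image(charm: CharmBase) -> None:
    """Set unit status from the three possible image-relation states."""
    image = OpenstackImage.from_charm(charm)
    if image is None:
        # Relation not established yet.
        charm.unit.status = BlockedStatus("Waiting for image integration")
    elif image.id is None:
        # Relation exists, but the builder has not published a ready image.
        charm.unit.status = WaitingStatus("Waiting for image to be ready")
    else:
        # id and tags are populated; runners can be spawned from this image.
        charm.unit.status = ActiveStatus(f"Using image {image.id}")
```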
+ --- - + -### function `path` +## class `OpenstackRunnerConfig` +Runner configuration for OpenStack Instances. -```python -path() → str -``` -Return a string representing the path. +**Attributes:** + + - `virtual_machines`: Number of virtual machine-based runner to spawn. + - `openstack_flavor`: flavor on openstack to use for virtual machines. + - `openstack_network`: Network on openstack to use for virtual machines. + - `openstack_image`: Openstack image to use for virtual machines. -**Returns:** - Path to the GitHub entity. --- -## class `ImmutableConfigChangedError` -Represents an error when changing immutable charm state. + - - -### function `__init__` +### classmethod `from_charm` ```python -__init__(msg: str) +from_charm(charm: CharmBase) → OpenstackRunnerConfig ``` -Initialize a new instance of the ImmutableConfigChangedError exception. +Initialize the config from charm. **Args:** - - `msg`: Explanation of the error. - - - - - ---- - -## class `InstanceType` -Type of instance for runner. + - `charm`: The charm instance. -**Attributes:** +**Raises:** - - `LOCAL_LXD`: LXD instance on the local juju machine. - - `OPENSTACK`: OpenStack instance on a cloud. + - `CharmConfigInvalidError`: Error with charm configuration virtual-machines not of int type. +**Returns:** + Openstack runner config of the charm. --- + + ## class `LocalLxdRunnerConfig` Runner configurations for local LXD instances. @@ -447,7 +498,7 @@ Runner configurations for local LXD instances. --- - + ### classmethod `check_virtual_machine_resources` @@ -478,7 +529,7 @@ Validate the virtual_machine_resources field values. --- - + ### classmethod `check_virtual_machines` @@ -507,7 +558,7 @@ Validate the virtual machines configuration value. --- - + ### classmethod `from_charm` @@ -537,73 +588,71 @@ Initialize the config from charm. --- -## class `OpenstackImage` -OpenstackImage from image builder relation data. + + +## class `ProxyConfig` +Proxy configuration. **Attributes:** - - `id`: The OpenStack image ID. - - `tags`: Image tags, e.g. jammy + - `aproxy_address`: The address of aproxy snap instance if use_aproxy is enabled. + - `http`: HTTP proxy address. + - `https`: HTTPS proxy address. + - `no_proxy`: Comma-separated list of hosts that should not be proxied. + - `use_aproxy`: Whether aproxy should be used for the runners. +--- + +#### property aproxy_address + +Return the aproxy address. + --- - + -### classmethod `from_charm` +### classmethod `check_use_aproxy` ```python -from_charm(charm: CharmBase) → OpenstackImage | None +check_use_aproxy(use_aproxy: bool, values: dict) → bool ``` -Initialize the OpenstackImage info from relation data. - -None represents relation not established. None values for id/tags represent image not yet ready but the relation exists. +Validate the proxy configuration. **Args:** - - `charm`: The charm instance. - - - -**Returns:** - OpenstackImage metadata from charm relation data. - - ---- - -## class `OpenstackRunnerConfig` -Runner configuration for OpenStack Instances. + - `use_aproxy`: Value of use_aproxy variable. + - `values`: Values in the pydantic model. -**Attributes:** +**Raises:** - - `virtual_machines`: Number of virtual machine-based runner to spawn. - - `openstack_flavor`: flavor on openstack to use for virtual machines. - - `openstack_network`: Network on openstack to use for virtual machines. - - `openstack_image`: Openstack image to use for virtual machines. + - `ValueError`: if use_aproxy was set but no http/https was passed. +**Returns:** + Validated use_aproxy value. 
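Read as a pydantic (v1-style) field validator, the `check_use_aproxy` contract above rejects `use_aproxy` unless at least one proxy endpoint is set. A minimal sketch with simplified field types; the real `ProxyConfig` uses URL types and carries more fields:

```python
from typing import Optional

from pydantic import BaseModel, validator


class ProxyConfigSketch(BaseModel):
    """Simplified stand-in for ProxyConfig, for illustration only."""

    http: Optional[str] = None
    https: Optional[str] = None
    use_aproxy: bool = False

    @validator("use_aproxy")
    def check_use_aproxy(cls, use_aproxy: bool, values: dict) -> bool:
        """Reject use_aproxy when there is no proxy endpoint to forward to."""
        if use_aproxy and not (values.get("http") or values.get("https")):
            raise ValueError("aproxy requires http or https to be set")
        return use_aproxy
```

With this shape, `ProxyConfigSketch(use_aproxy=True)` fails validation, while `ProxyConfigSketch(http="http://proxy.example:3128", use_aproxy=True)` passes.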
--- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → OpenstackRunnerConfig +from_charm(charm: CharmBase) → ProxyConfig ``` -Initialize the config from charm. +Initialize the proxy config from charm. @@ -613,81 +662,73 @@ Initialize the config from charm. -**Raises:** - - - `CharmConfigInvalidError`: Error with charm configuration virtual-machines not of int type. - - - **Returns:** - Openstack runner config of the charm. + Current proxy config of the charm. --- -## class `ProxyConfig` -Proxy configuration. + + +## class `UnsupportedArchitectureError` +Raised when given machine charm architecture is unsupported. **Attributes:** - - `aproxy_address`: The address of aproxy snap instance if use_aproxy is enabled. - - `http`: HTTP proxy address. - - `https`: HTTPS proxy address. - - `no_proxy`: Comma-separated list of hosts that should not be proxied. - - `use_aproxy`: Whether aproxy should be used for the runners. + - `arch`: The current machine architecture. + ---- +### method `__init__` -#### property aproxy_address +```python +__init__(arch: str) → None +``` -Return the aproxy address. +Initialize a new instance of the CharmConfigInvalidError exception. ---- +**Args:** + + - `arch`: The current machine architecture. - -### classmethod `check_use_aproxy` -```python -check_use_aproxy(use_aproxy: bool, values: dict) → bool -``` -Validate the proxy configuration. +--- + -**Args:** - - - `use_aproxy`: Value of use_aproxy variable. - - `values`: Values in the pydantic model. +## class `SSHDebugConnection` +SSH connection information for debug workflow. -**Raises:** +**Attributes:** - - `ValueError`: if use_aproxy was set but no http/https was passed. + - `host`: The SSH relay server host IP address inside the VPN. + - `port`: The SSH relay server port. + - `rsa_fingerprint`: The host SSH server public RSA key fingerprint. + - `ed25519_fingerprint`: The host SSH server public ed25519 key fingerprint. -**Returns:** - Validated use_aproxy value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → ProxyConfig +from_charm(charm: CharmBase) → list['SSHDebugConnection'] ``` -Initialize the proxy config from charm. +Initialize the SSHDebugInfo from charm relation data. @@ -698,65 +739,77 @@ Initialize the proxy config from charm. **Returns:** - Current proxy config of the charm. + List of connection information for ssh debug access. --- -## class `RepoPolicyComplianceConfig` -Configuration for the repo policy compliance service. + + +## class `ReactiveConfig` +Represents the configuration for reactive scheduling. **Attributes:** - - `token`: Token for the repo policy compliance service. - - `url`: URL of the repo policy compliance service. + - `mq_uri`: The URI of the MQ to use to spawn runners reactively. --- - + -### classmethod `from_charm` +### classmethod `from_database` ```python -from_charm(charm: CharmBase) → RepoPolicyComplianceConfig +from_database(database: DatabaseRequires) → ReactiveConfig | None ``` -Initialize the config from charm. +Initialize the ReactiveConfig from charm config and integration data. **Args:** - - `charm`: The charm instance. + - `database`: The database to fetch integration data from. + + + +**Returns:** + The connection information for the reactive MQ or None if not available. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration was set. + - `MissingMongoDBError`: If the information on howto access MongoDB is missing in the integration data. 
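A sketch of how a charm might wire `ReactiveConfig.from_database` to a `DatabaseRequires` instance. The relation and database names below are assumptions for illustration; the charm's real values come from its integration constants (e.g. `MONGO_DB_INTEGRATION_NAME`):

```python
from charms.data_platform_libs.v0.data_interfaces import DatabaseRequires
from ops import CharmBase

from charm_state import ReactiveConfig


class SketchCharm(CharmBase):
    """Illustrative charm skeleton, not the real GithubRunnerCharm."""

    def __init__(self, *args):
        super().__init__(*args)
        # Relation and database names here are illustrative assumptions.
        self._database = DatabaseRequires(
            self, relation_name="mongodb", database_name="github-runner"
        )

    def _reconcile(self) -> None:
        reactive_config = ReactiveConfig.from_database(self._database)
        if reactive_config is None:
            return  # No MongoDB integration data: reactive mode stays off.
        mq_uri = reactive_config.mq_uri  # URI used to spawn runners reactively.
```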
+--- -**Returns:** - Current repo-policy-compliance config. + +## class `ImmutableConfigChangedError` +Represents an error when changing immutable charm state. ---- + -## class `RunnerStorage` -Supported storage as runner disk. +### method `__init__` +```python +__init__(msg: str) +``` +Initialize a new instance of the ImmutableConfigChangedError exception. -**Attributes:** + + +**Args:** - - `JUJU_STORAGE`: Represents runner storage from Juju storage. - - `MEMORY`: Represents tempfs storage (ramdisk). + - `msg`: Explanation of the error. @@ -764,90 +817,76 @@ Supported storage as runner disk. --- -## class `SSHDebugConnection` -SSH connection information for debug workflow. + + +## class `CharmState` +The charm state. **Attributes:** - - `host`: The SSH relay server host IP address inside the VPN. - - `port`: The SSH relay server port. - - `rsa_fingerprint`: The host SSH server public RSA key fingerprint. - - `ed25519_fingerprint`: The host SSH server public ed25519 key fingerprint. - - - - ---- + - `arch`: The underlying compute architecture, i.e. x86_64, amd64, arm64/aarch64. + - `charm_config`: Configuration of the juju charm. + - `is_metrics_logging_available`: Whether the charm is able to issue metrics. + - `proxy_config`: Proxy-related configuration. + - `instance_type`: The type of instances, e.g., local lxd, openstack. + - `reactive_config`: The charm configuration related to reactive spawning mode. + - `runner_config`: The charm configuration related to runner VM configuration. + - `ssh_debug_connections`: SSH debug connections configuration information. - + -### classmethod `from_charm` +### method `__init__` ```python -from_charm(charm: CharmBase) → list['SSHDebugConnection'] +__init__( + arch: Arch, + is_metrics_logging_available: bool, + proxy_config: ProxyConfig, + instance_type: InstanceType, + charm_config: CharmConfig, + runner_config: OpenstackRunnerConfig | LocalLxdRunnerConfig, + reactive_config: ReactiveConfig | None, + ssh_debug_connections: list[SSHDebugConnection] +) → None ``` -Initialize the SSHDebugInfo from charm relation data. -**Args:** - - - `charm`: The charm instance. - -**Returns:** - List of connection information for ssh debug access. --- -## class `UnsupportedArchitectureError` -Raised when given machine charm architecture is unsupported. - + - -**Attributes:** - - - `arch`: The current machine architecture. - - - -### function `__init__` +### classmethod `from_charm` ```python -__init__(arch: str) → None +from_charm(charm: CharmBase, database: DatabaseRequires) → CharmState ``` -Initialize a new instance of the CharmConfigInvalidError exception. +Initialize the state from charm. **Args:** - - `arch`: The current machine architecture. - - - - - ---- - -## class `VirtualMachineResources` -Virtual machine resource configuration. + - `charm`: The charm instance. + - `database`: The database instance. -**Attributes:** +**Raises:** - - `cpu`: Number of vCPU for the virtual machine. - - `memory`: Amount of memory for the virtual machine. - - `disk`: Amount of disk for the virtual machine. + - `CharmConfigInvalidError`: If an invalid configuration was set. +**Returns:** + Current state of the charm. diff --git a/src-docs/errors.md b/src-docs/errors.md index ee5db5a11..c61dd8410 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -7,39 +7,6 @@ Errors used by the charm. ---- - - - -## class `RunnerError` -Generic runner error as base exception. - - - - - ---- - - - -## class `RunnerExecutionError` -Error for executing commands on runner. 
- - - - - ---- - - - -## class `RunnerFileLoadError` -Error for loading file on runner. - - - - - --- @@ -55,8 +22,8 @@ Error for runner creation failure. -## class `RunnerRemoveError` -Error for runner removal failure. +## class `RunnerFileLoadError` +Error for loading file on runner. @@ -66,8 +33,8 @@ Error for runner removal failure. -## class `RunnerStartError` -Error for runner start failure. +## class `RunnerRemoveError` +Error for runner removal failure. @@ -220,17 +187,6 @@ Represents an error raised when logrotate cannot be setup. -## class `MetricsStorageError` -Base class for all metrics storage errors. - - - - - ---- - - - ## class `SharedFilesystemError` Base class for all shared filesystem errors. @@ -240,51 +196,7 @@ Base class for all shared filesystem errors. --- - - -## class `CreateMetricsStorageError` -Represents an error when the metrics storage could not be created. - - - - - ---- - - - -## class `DeleteMetricsStorageError` -Represents an error when the metrics storage could not be deleted. - - - - - ---- - - - -## class `GetMetricsStorageError` -Represents an error when the metrics storage could not be retrieved. - - - - - ---- - - - -## class `QuarantineMetricsStorageError` -Represents an error when the metrics storage could not be quarantined. - - - - - ---- - - + ## class `SharedFilesystemMountError` Represents an error related to the mounting of the shared filesystem. @@ -295,84 +207,7 @@ Represents an error related to the mounting of the shared filesystem. --- - - -## class `RunnerMetricsError` -Base class for all runner metrics errors. - - - - - ---- - - - -## class `CorruptMetricDataError` -Represents an error with the data being corrupt. - - - - - ---- - - - -## class `GithubMetricsError` -Base class for all github metrics errors. - - - - - ---- - - - -## class `GithubClientError` -Base class for all github client errors. - - - - - ---- - - - -## class `GithubApiError` -Represents an error when the GitHub API returns an error. - - - - - ---- - - - -## class `TokenError` -Represents an error when the token is invalid or has not enough permissions. - - - - - ---- - - - -## class `JobNotFoundError` -Represents an error when the job could not be found on GitHub. - - - - - ---- - - + ## class `RunnerLogsError` Base class for all runner logs errors. @@ -381,58 +216,3 @@ Base class for all runner logs errors. ---- - - - -## class `OpenStackError` -Base class for OpenStack errors. - - - - - ---- - - - -## class `OpenStackInvalidConfigError` -Represents an invalid OpenStack configuration. - - - - - ---- - - - -## class `OpenStackUnauthorizedError` -Represents an unauthorized connection to OpenStack. - - - - - ---- - - - -## class `SSHError` -Represents an error while interacting with SSH. - - - - - ---- - - - -## class `KeyfileError` -Represents missing keyfile for SSH. - - - - - diff --git a/src-docs/github_client.md b/src-docs/github_client.md index fc0de8f7b..679c9f907 100644 --- a/src-docs/github_client.md +++ b/src-docs/github_client.md @@ -8,116 +8,20 @@ GitHub API client. Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner. ---- - - - -## function `catch_http_errors` - -```python -catch_http_errors( - func: Callable[~ParamT, ~ReturnT] -) → Callable[~ParamT, ~ReturnT] -``` - -Catch HTTP errors and raise custom exceptions. - - - -**Args:** - - - `func`: The target function to catch common errors for. - - - -**Returns:** - The decorated function. - --- - + ## class `GithubClient` GitHub API client. 
- - -### method `__init__` - -```python -__init__(token: str) -``` - -Instantiate the GiHub API client. - - - -**Args:** - - - `token`: GitHub personal token for API requests. - --- - - -### method `delete_runner` - -```python -delete_runner(path: GitHubOrg | GitHubRepo, runner_id: int) → None -``` - -Delete the self-hosted runner from GitHub. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - `runner_id`: Id of the runner. - ---- - - - -### method `get_job_info` - -```python -get_job_info( - path: GitHubRepo, - workflow_run_id: str, - runner_name: str -) → JobStats -``` - -Get information about a job for a specific workflow run. - - - -**Args:** - - - `path`: GitHub repository path in the format '/'. - - `workflow_run_id`: Id of the workflow run. - - `runner_name`: Name of the runner. - - - -**Raises:** - - - `TokenError`: if there was an error with the Github token crdential provided. - - `JobNotFoundError`: If no jobs were found. - - - -**Returns:** - Job information. - ---- - - + ### method `get_runner_application` @@ -150,73 +54,4 @@ Get runner application available for download for given arch. **Returns:** The runner application. ---- - - - -### method `get_runner_github_info` - -```python -get_runner_github_info(path: GitHubOrg | GitHubRepo) → list[SelfHostedRunner] -``` - -Get runner information on GitHub under a repo or org. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - - -**Returns:** - List of runner information. - ---- - - - -### method `get_runner_registration_token` - -```python -get_runner_registration_token(path: GitHubOrg | GitHubRepo) → str -``` - -Get token from GitHub used for registering runners. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - - -**Returns:** - The registration token. - ---- - - - -### method `get_runner_remove_token` - -```python -get_runner_remove_token(path: GitHubOrg | GitHubRepo) → str -``` - -Get token from GitHub used for removing runners. - - - -**Args:** - - - `path`: The Github org/repo path. - - - -**Returns:** - The removing token. - diff --git a/src-docs/runner.md b/src-docs/runner.md index b513ad697..d7bfb93c1 100644 --- a/src-docs/runner.md +++ b/src-docs/runner.md @@ -17,7 +17,7 @@ The `RunnerManager` class from `runner_manager.py` creates and manages a collect --- - + ## class `Snap` This class represents a snap installation. @@ -36,7 +36,7 @@ This class represents a snap installation. --- - + ## class `WgetExecutable` The executable to be installed through wget. @@ -66,7 +66,7 @@ __init__(url: str, cmd: str) → None --- - + ## class `CreateRunnerConfig` The configuration values for creating a single runner instance. @@ -105,7 +105,7 @@ __init__( --- - + ## class `Runner` Single instance of GitHub self-hosted runner. @@ -120,7 +120,7 @@ Single instance of GitHub self-hosted runner. - `runner_script`: The runner start script file path. - `pre_job_script`: The runner pre_job script file path. This is referenced in the env_file in the ACTIONS_RUNNER_HOOK_JOB_STARTED environment variable. - + ### method `__init__` @@ -149,7 +149,7 @@ Construct the runner instance. --- - + ### method `create` @@ -173,7 +173,7 @@ Create the runner instance on LXD and register it on GitHub. --- - + ### method `pull_logs` @@ -193,7 +193,7 @@ Expects the runner to have an instance. 
--- - + ### method `remove` diff --git a/src-docs/runner_manager.md b/src-docs/runner_manager.md index f52829efa..883745753 100644 --- a/src-docs/runner_manager.md +++ b/src-docs/runner_manager.md @@ -52,7 +52,7 @@ Construct RunnerManager object for creating and managing runners. --- - + ### method `build_runner_image` @@ -135,7 +135,7 @@ Get information on the runners from GitHub. --- - + ### method `get_latest_runner_bin_url` @@ -219,7 +219,7 @@ Install cron job for building runner image. --- - + ### method `update_runner_bin` diff --git a/src-docs/runner_type.md b/src-docs/runner_type.md index d5029f4f8..cde5b2a7e 100644 --- a/src-docs/runner_type.md +++ b/src-docs/runner_type.md @@ -9,7 +9,7 @@ Types used by Runner class. --- - + ## class `RunnerNameByHealth` Set of runners instance by health state. @@ -39,7 +39,7 @@ __init__(healthy: tuple[str, ], unhealthy: tuple[str, ]) → None --- - + ## class `ProxySetting` Represent HTTP-related proxy settings. @@ -76,7 +76,7 @@ __init__( --- - + ## class `RunnerConfig` Configuration for runner. @@ -123,7 +123,7 @@ __init__( --- - + ## class `RunnerStatus` Status of runner. @@ -160,7 +160,7 @@ __init__( --- - + ## class `RunnerGithubInfo` GitHub info of a runner. diff --git a/src-docs/shared_fs.md b/src-docs/shared_fs.md index 004556f9f..5ae59a8ca 100644 --- a/src-docs/shared_fs.md +++ b/src-docs/shared_fs.md @@ -13,7 +13,7 @@ Classes and functions to operate on the shared filesystem between the charm and --- - + ## function `create` @@ -45,7 +45,7 @@ The method is not idempotent and will raise an exception if the shared filesyste --- - + ## function `list_all` @@ -63,7 +63,7 @@ List all the metric storages. --- - + ## function `get` @@ -95,7 +95,7 @@ Mounts the filesystem if it is not currently mounted. --- - + ## function `delete` diff --git a/src-docs/utilities.md b/src-docs/utilities.md index 6c2aab4e1..b2c4cbf21 100644 --- a/src-docs/utilities.md +++ b/src-docs/utilities.md @@ -8,77 +8,7 @@ Utilities used by the charm. --- - - -## function `retry` - -```python -retry( - exception: Type[Exception] = , - tries: int = 1, - delay: float = 0, - max_delay: Optional[float] = None, - backoff: float = 1, - local_logger: Logger = -) → Callable[[Callable[~ParamT, ~ReturnT]], Callable[~ParamT, ~ReturnT]] -``` - -Parameterize the decorator for adding retry to functions. - - - -**Args:** - - - `exception`: Exception type to be retried. - - `tries`: Number of attempts at retry. - - `delay`: Time in seconds to wait between retry. - - `max_delay`: Max time in seconds to wait between retry. - - `backoff`: Factor to increase the delay by each retry. - - `local_logger`: Logger for logging. - - - -**Returns:** - The function decorator for retry. - - ---- - - - -## function `secure_run_subprocess` - -```python -secure_run_subprocess( - cmd: Sequence[str], - hide_cmd: bool = False, - **kwargs: dict[str, Any] -) → CompletedProcess[bytes] -``` - -Run command in subprocess according to security recommendations. - -CalledProcessError will not be raised on error of the command executed. Errors should be handled by the caller by checking the exit code. - -The command is executed with `subprocess.run`, additional arguments can be passed to it as keyword arguments. The following arguments to `subprocess.run` should not be set: `capture_output`, `shell`, `check`. As those arguments are used by this function. - - - -**Args:** - - - `cmd`: Command in a list. - - `hide_cmd`: Hide logging of cmd. - - `kwargs`: Additional keyword arguments for the `subprocess.run` call. 
- - - -**Returns:** - Object representing the completed process. The outputs subprocess can accessed. - - ---- - - + ## function `execute_command` @@ -118,7 +48,7 @@ The output is logged if the log level of the logger is set to debug. --- - + ## function `get_env_var` @@ -144,29 +74,7 @@ Looks for all upper-case and all low-case of the `env_var`. --- - - -## function `set_env_var` - -```python -set_env_var(env_var: str, value: str) → None -``` - -Set the environment variable value. - -Set the all upper case and all low case of the `env_var`. - - - -**Args:** - - - `env_var`: Name of the environment variable. - - `value`: Value to set environment variable to. - - ---- - - + ## function `bytes_with_unit_to_kib` @@ -196,7 +104,7 @@ Convert a positive integer followed by a unit to number of kibibytes. --- - + ## function `remove_residual_venv_dirs` diff --git a/src/charm.py b/src/charm.py index c60c62bea..0b17dbf52 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,10 +7,23 @@ # pylint: disable=too-many-lines """Charm for creating and managing GitHub self-hosted runner instances.""" +from github_runner_manager.manager.cloud_runner_manager import ( + GitHubRunnerConfig, + SupportServiceConfig, +) +from github_runner_manager.manager.runner_manager import ( + FlushMode, + RunnerManager, + RunnerManagerConfig, +) +from github_runner_manager.manager.runner_scaler import RunnerScaler +from github_runner_manager.openstack_cloud.openstack_runner_manager import ( + OpenStackCloudConfig, + OpenStackRunnerManager, + OpenStackServerConfig, +) +from github_runner_manager.types_.github import GitHubPath, GitHubRunnerStatus, parse_github_path -from manager.cloud_runner_manager import GitHubRunnerConfig, SupportServiceConfig -from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig -from manager.runner_scaler import RunnerScaler from utilities import bytes_with_unit_to_kib, execute_command, remove_residual_venv_dirs, retry # This is a workaround for https://bugs.launchpad.net/juju/+bug/2058335 @@ -59,20 +72,17 @@ TOKEN_CONFIG_NAME, CharmConfigInvalidError, CharmState, - GitHubPath, InstanceType, OpenstackImage, ProxyConfig, RunnerStorage, VirtualMachineResources, - parse_github_path, ) from errors import ( ConfigurationError, LogrotateSetupError, MissingMongoDBError, MissingRunnerBinaryError, - OpenStackUnauthorizedError, RunnerBinaryError, RunnerError, SubprocessError, @@ -80,12 +90,6 @@ ) from event_timer import EventTimer, TimerStatusError from firewall import Firewall, FirewallEntry -from github_type import GitHubRunnerStatus -from openstack_cloud.openstack_runner_manager import ( - OpenStackCloudConfig, - OpenStackRunnerManager, - OpenStackServerConfig, -) from runner import LXD_PROFILE_YAML from runner_manager import LXDRunnerManager, LXDRunnerManagerConfig from runner_manager_type import LXDFlushMode @@ -140,11 +144,6 @@ def func_with_catch_errors(self: "GithubRunnerCharm", event: EventT) -> None: "GitHub runner application not downloaded; the charm will retry download on " "reconcile interval" ) - except OpenStackUnauthorizedError: - logger.exception("Unauthorized OpenStack connection") - self.unit.status = BlockedStatus( - "Unauthorized OpenStack connection. Check credentials." 
- ) except MissingMongoDBError as err: logger.exception("Missing integration data") self.unit.status = WaitingStatus(str(err)) diff --git a/src/charm_state.py b/src/charm_state.py index dcd87d122..6ae46d386 100644 --- a/src/charm_state.py +++ b/src/charm_state.py @@ -19,6 +19,9 @@ import yaml from charms.data_platform_libs.v0.data_interfaces import DatabaseRequires +from github_runner_manager import openstack_cloud +from github_runner_manager.errors import OpenStackInvalidConfigError +from github_runner_manager.types_.github import GitHubPath, parse_github_path from ops import CharmBase from pydantic import ( AnyHttpUrl, @@ -31,8 +34,7 @@ validator, ) -import openstack_cloud -from errors import MissingMongoDBError, OpenStackInvalidConfigError +from errors import MissingMongoDBError from firewall import FirewallEntry from utilities import get_env_var @@ -87,75 +89,6 @@ class AnyHttpsUrl(AnyHttpUrl): allowed_schemes = {"https"} -@dataclasses.dataclass -class GitHubRepo: - """Represent GitHub repository. - - Attributes: - owner: Owner of the GitHub repository. - repo: Name of the GitHub repository. - """ - - owner: str - repo: str - - def path(self) -> str: - """Return a string representing the path. - - Returns: - Path to the GitHub entity. - """ - return f"{self.owner}/{self.repo}" - - -@dataclasses.dataclass -class GitHubOrg: - """Represent GitHub organization. - - Attributes: - org: Name of the GitHub organization. - group: Runner group to spawn the runners in. - """ - - org: str - group: str - - def path(self) -> str: - """Return a string representing the path. - - Returns: - Path to the GitHub entity. - """ - return self.org - - -GitHubPath = GitHubOrg | GitHubRepo - - -def parse_github_path(path_str: str, runner_group: str) -> GitHubPath: - """Parse GitHub path. - - Args: - path_str: GitHub path in string format. - runner_group: Runner group name for GitHub organization. If the path is - a repository this argument is ignored. - - Raises: - CharmConfigInvalidError: if an invalid path string was given. - - Returns: - GithubPath object representing the GitHub repository, or the GitHub - organization with runner group information. - """ - if "/" in path_str: - paths = tuple(segment for segment in path_str.split("/") if segment) - if len(paths) != 2: - raise CharmConfigInvalidError(f"Invalid path configuration {path_str}") - owner, repo = paths - return GitHubRepo(owner=owner, repo=repo) - return GitHubOrg(org=path_str, group=runner_group) - - @dataclasses.dataclass class GithubConfig: """Charm configuration related to GitHub. 
@@ -186,7 +119,10 @@ def from_charm(cls, charm: CharmBase) -> "GithubConfig": path_str = cast(str, charm.config.get(PATH_CONFIG_NAME, "")) if not path_str: raise CharmConfigInvalidError(f"Missing {PATH_CONFIG_NAME} configuration") - path = parse_github_path(cast(str, path_str), cast(str, runner_group)) + try: + path = parse_github_path(cast(str, path_str), cast(str, runner_group)) + except ValueError as e: + raise CharmConfigInvalidError(str(e)) from e token = cast(str, charm.config.get(TOKEN_CONFIG_NAME)) if not token: diff --git a/src/errors.py b/src/errors.py index 4285dc6e4..7212b4642 100644 --- a/src/errors.py +++ b/src/errors.py @@ -6,31 +6,31 @@ from typing import Union +# we import the errors from the module, these are used in the charm +from github_runner_manager.errors import ( # noqa: F401 pylint: disable=unused-import + CreateMetricsStorageError, + DeleteMetricsStorageError, + GetMetricsStorageError, + GithubClientError, + GithubMetricsError, + MetricsStorageError, + RunnerError, + TokenError, +) -class RunnerError(Exception): - """Generic runner error as base exception.""" - -class RunnerExecutionError(RunnerError): - """Error for executing commands on runner.""" +class RunnerCreateError(RunnerError): + """Error for runner creation failure.""" class RunnerFileLoadError(RunnerError): """Error for loading file on runner.""" -class RunnerCreateError(RunnerError): - """Error for runner creation failure.""" - - class RunnerRemoveError(RunnerError): """Error for runner removal failure.""" -class RunnerStartError(RunnerError): - """Error for runner start failure.""" - - class RunnerBinaryError(RunnerError): """Error of getting runner binary.""" @@ -100,81 +100,13 @@ class LogrotateSetupError(Exception): """Represents an error raised when logrotate cannot be setup.""" -class MetricsStorageError(Exception): - """Base class for all metrics storage errors.""" - - class SharedFilesystemError(MetricsStorageError): """Base class for all shared filesystem errors.""" -class CreateMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be created.""" - - -class DeleteMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be deleted.""" - - -class GetMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be retrieved.""" - - -class QuarantineMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be quarantined.""" - - class SharedFilesystemMountError(SharedFilesystemError): """Represents an error related to the mounting of the shared filesystem.""" -class RunnerMetricsError(Exception): - """Base class for all runner metrics errors.""" - - -class CorruptMetricDataError(RunnerMetricsError): - """Represents an error with the data being corrupt.""" - - -class GithubMetricsError(Exception): - """Base class for all github metrics errors.""" - - -class GithubClientError(Exception): - """Base class for all github client errors.""" - - -class GithubApiError(GithubClientError): - """Represents an error when the GitHub API returns an error.""" - - -class TokenError(GithubClientError): - """Represents an error when the token is invalid or has not enough permissions.""" - - -class JobNotFoundError(GithubClientError): - """Represents an error when the job could not be found on GitHub.""" - - class RunnerLogsError(Exception): """Base class for all runner logs errors.""" - - -class OpenStackError(Exception): - """Base class 
for OpenStack errors.""" - - -class OpenStackInvalidConfigError(OpenStackError): - """Represents an invalid OpenStack configuration.""" - - -class OpenStackUnauthorizedError(OpenStackError): - """Represents an unauthorized connection to OpenStack.""" - - -class SSHError(Exception): - """Represents an error while interacting with SSH.""" - - -class KeyfileError(SSHError): - """Represents missing keyfile for SSH.""" diff --git a/src/github_client.py b/src/github_client.py index b724b5cdb..b14d3b799 100644 --- a/src/github_client.py +++ b/src/github_client.py @@ -6,27 +6,22 @@ Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner. """ -import functools import logging -from datetime import datetime -from typing import Callable, ParamSpec, TypeVar -from urllib.error import HTTPError - -from ghapi.all import GhApi, pages -from ghapi.page import paged -from typing_extensions import assert_never - -from charm_state import Arch, GitHubOrg, GitHubPath, GitHubRepo -from errors import GithubApiError, JobNotFoundError, RunnerBinaryError, TokenError -from github_type import ( - JobStats, - RegistrationToken, - RemoveToken, +from typing import ParamSpec, TypeVar + +from github_runner_manager.github_client import GithubClient as GitHubRunnerManagerGitHubClient +from github_runner_manager.github_client import catch_http_errors +from github_runner_manager.types_.github import ( + GitHubOrg, + GitHubPath, + GitHubRepo, RunnerApplication, RunnerApplicationList, - SelfHostedRunner, ) +from charm_state import Arch +from errors import RunnerBinaryError + logger = logging.getLogger(__name__) # Parameters of the function decorated with retry @@ -35,57 +30,9 @@ ReturnT = TypeVar("ReturnT") -def catch_http_errors(func: Callable[ParamT, ReturnT]) -> Callable[ParamT, ReturnT]: - """Catch HTTP errors and raise custom exceptions. - - Args: - func: The target function to catch common errors for. - - Returns: - The decorated function. - """ - - @functools.wraps(func) - def wrapper(*args: ParamT.args, **kwargs: ParamT.kwargs) -> ReturnT: - """Catch common errors when using the GitHub API. - - Args: - args: Placeholder for positional arguments. - kwargs: Placeholder for keyword arguments. - - Raises: - TokenError: If there was an error with the provided token. - GithubApiError: If there was an unexpected error using the GitHub API. - - Returns: - The decorated function. - """ - try: - return func(*args, **kwargs) - except HTTPError as exc: - if exc.code in (401, 403): - if exc.code == 401: - msg = "Invalid token." - else: - msg = "Provided token has not enough permissions or has reached rate-limit." - raise TokenError(msg) from exc - raise GithubApiError from exc - - return wrapper - - -class GithubClient: +class GithubClient(GitHubRunnerManagerGitHubClient): """GitHub API client.""" - def __init__(self, token: str): - """Instantiate the GiHub API client. - - Args: - token: GitHub personal token for API requests. - """ - self._token = token - self._client = GhApi(token=self._token) - @catch_http_errors def get_runner_application( self, path: GitHubPath, arch: Arch, os: str = "linux" @@ -125,176 +72,3 @@ def get_runner_application( raise RunnerBinaryError( f"Unable query GitHub runner binary information for {os} {arch}" ) from err - - @catch_http_errors - def get_runner_github_info(self, path: GitHubPath) -> list[SelfHostedRunner]: - """Get runner information on GitHub under a repo or org. 
- - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - - Returns: - List of runner information. - """ - remote_runners_list: list[SelfHostedRunner] = [] - - if isinstance(path, GitHubRepo): - # The documentation of ghapi for pagination is incorrect and examples will give errors. - # This workaround is a temp solution. Will be moving to PyGitHub in the future. - self._client.actions.list_self_hosted_runners_for_repo( - owner=path.owner, repo=path.repo, per_page=100 - ) - num_of_pages = self._client.last_page() - remote_runners_list = [ - item - for page in pages( - self._client.actions.list_self_hosted_runners_for_repo, - num_of_pages + 1, - owner=path.owner, - repo=path.repo, - per_page=100, - ) - for item in page["runners"] - ] - if isinstance(path, GitHubOrg): - # The documentation of ghapi for pagination is incorrect and examples will give errors. - # This workaround is a temp solution. Will be moving to PyGitHub in the future. - self._client.actions.list_self_hosted_runners_for_org(org=path.org, per_page=100) - num_of_pages = self._client.last_page() - remote_runners_list = [ - item - for page in pages( - self._client.actions.list_self_hosted_runners_for_org, - num_of_pages + 1, - org=path.org, - per_page=100, - ) - for item in page["runners"] - ] - return remote_runners_list - - @catch_http_errors - def get_runner_remove_token(self, path: GitHubPath) -> str: - """Get token from GitHub used for removing runners. - - Args: - path: The Github org/repo path. - - Returns: - The removing token. - """ - token: RemoveToken - if isinstance(path, GitHubRepo): - token = self._client.actions.create_remove_token_for_repo( - owner=path.owner, repo=path.repo - ) - elif isinstance(path, GitHubOrg): - token = self._client.actions.create_remove_token_for_org(org=path.org) - else: - assert_never(token) - - return token["token"] - - @catch_http_errors - def get_runner_registration_token(self, path: GitHubPath) -> str: - """Get token from GitHub used for registering runners. - - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - - Returns: - The registration token. - """ - token: RegistrationToken - if isinstance(path, GitHubRepo): - token = self._client.actions.create_registration_token_for_repo( - owner=path.owner, repo=path.repo - ) - elif isinstance(path, GitHubOrg): - token = self._client.actions.create_registration_token_for_org(org=path.org) - else: - assert_never(token) - - return token["token"] - - @catch_http_errors - def delete_runner(self, path: GitHubPath, runner_id: int) -> None: - """Delete the self-hosted runner from GitHub. - - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - runner_id: Id of the runner. - """ - if isinstance(path, GitHubRepo): - self._client.actions.delete_self_hosted_runner_from_repo( - owner=path.owner, - repo=path.repo, - runner_id=runner_id, - ) - if isinstance(path, GitHubOrg): - self._client.actions.delete_self_hosted_runner_from_org( - org=path.org, - runner_id=runner_id, - ) - - def get_job_info(self, path: GitHubRepo, workflow_run_id: str, runner_name: str) -> JobStats: - """Get information about a job for a specific workflow run. - - Args: - path: GitHub repository path in the format '/'. - workflow_run_id: Id of the workflow run. - runner_name: Name of the runner. - - Raises: - TokenError: if there was an error with the Github token crdential provided. - JobNotFoundError: If no jobs were found. - - Returns: - Job information. 
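
The pagination workaround above is worth spelling out: ghapi's documented pagination examples fail, so the deleted code primes the endpoint once, asks the client for the last page number, and then re-requests every page. A condensed sketch, with placeholder owner/repo values:

    from ghapi.all import GhApi, pages

    client = GhApi(token="<personal-access-token>")

    # The first call primes ghapi so that last_page() reflects this endpoint.
    client.actions.list_self_hosted_runners_for_repo(owner="me", repo="app", per_page=100)
    num_of_pages = client.last_page()

    runners = [
        runner
        for page in pages(
            client.actions.list_self_hosted_runners_for_repo,
            num_of_pages + 1,
            owner="me",
            repo="app",
            per_page=100,
        )
        for runner in page["runners"]
    ]
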
- """ - paged_kwargs = {"owner": path.owner, "repo": path.repo, "run_id": workflow_run_id} - try: - for wf_run_page in paged( - self._client.actions.list_jobs_for_workflow_run, **paged_kwargs - ): - jobs = wf_run_page["jobs"] - # ghapi performs endless pagination, - # so we have to break out of the loop if there are no more jobs - if not jobs: - break - for job in jobs: - if job["runner_name"] == runner_name: - # datetime strings should be in ISO 8601 format, - # but they can also use Z instead of - # +00:00, which is not supported by datetime.fromisoformat - created_at = datetime.fromisoformat( - job["created_at"].replace("Z", "+00:00") - ) - started_at = datetime.fromisoformat( - job["started_at"].replace("Z", "+00:00") - ) - # conclusion could be null per api schema, so we need to handle that - # though we would assume that it should always be present, - # as the job should be finished - conclusion = job.get("conclusion", None) - - job_id = job["id"] - return JobStats( - job_id=job_id, - created_at=created_at, - started_at=started_at, - conclusion=conclusion, - ) - - except HTTPError as exc: - if exc.code in (401, 403): - raise TokenError from exc - raise JobNotFoundError( - f"Could not find job for runner {runner_name}. " - f"Could not list jobs for workflow run {workflow_run_id}" - ) from exc - - raise JobNotFoundError(f"Could not find job for runner {runner_name}.") diff --git a/src/github_type.py b/src/github_type.py deleted file mode 100644 index a26a0279a..000000000 --- a/src/github_type.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Return type for the GitHub web API.""" - - -from __future__ import annotations - -from datetime import datetime -from enum import Enum -from typing import List, Literal, Optional, TypedDict - -from pydantic import BaseModel -from typing_extensions import NotRequired - - -class GitHubRunnerStatus(str, Enum): - """Status of runner on GitHub. - - Attributes: - ONLINE: Represents an online runner status. - OFFLINE: Represents an offline runner status. - """ - - ONLINE = "online" - OFFLINE = "offline" - - -# See response schema for -# https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#list-runner-applications-for-an-organization -class RunnerApplication(TypedDict, total=False): - """Information on the runner application. - - Attributes: - os: Operating system to run the runner application on. - architecture: Computer Architecture to run the runner application on. - download_url: URL to download the runner application. - filename: Filename of the runner application. - temp_download_token: A short lived bearer token used to download the - runner, if needed. - sha256_checksum: SHA256 Checksum of the runner application. - """ - - os: Literal["linux", "win", "osx"] - architecture: Literal["arm", "arm64", "x64"] - download_url: str - filename: str - temp_download_token: NotRequired[str] - sha256_checksum: NotRequired[str] - - -RunnerApplicationList = List[RunnerApplication] - - -class SelfHostedRunnerLabel(TypedDict, total=False): - """A single label of self-hosted runners. - - Attributes: - id: Unique identifier of the label. - name: Name of the label. - type: Type of label. Read-only labels are applied automatically when - the runner is configured. - """ - - id: NotRequired[int] - name: str - type: NotRequired[str] - - -class SelfHostedRunner(TypedDict): - """Information on a single self-hosted runner. 
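
The timestamp handling in `get_job_info` above exists because GitHub may emit ISO 8601 strings with a trailing `Z`, which `datetime.fromisoformat` rejects on Python versions before 3.11. The workaround in the deleted code, in isolation:

    from datetime import datetime

    raw = "2024-09-09T13:40:46Z"  # typical GitHub API timestamp
    created_at = datetime.fromisoformat(raw.replace("Z", "+00:00"))
    print(created_at.tzinfo)  # UTC: the value stays timezone-aware
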
- - Attributes: - busy: Whether the runner is executing a job. - id: Unique identifier of the runner. - labels: Labels of the runner. - os: Operation system of the runner. - name: Name of the runner. - status: The Github runner status. - """ - - busy: bool - id: int - labels: list[SelfHostedRunnerLabel] - os: str - name: str - status: GitHubRunnerStatus - - -class SelfHostedRunnerList(TypedDict): - """Information on a collection of self-hosted runners. - - Attributes: - total_count: Total number of runners. - runners: List of runners. - """ - - total_count: int - runners: list[SelfHostedRunner] - - -class RegistrationToken(TypedDict): - """Token used for registering GitHub runners. - - Attributes: - token: Token for registering GitHub runners. - expires_at: Time the token expires at. - """ - - token: str - expires_at: str - - -class RemoveToken(TypedDict): - """Token used for removing GitHub runners. - - Attributes: - token: Token for removing GitHub runners. - expires_at: Time the token expires at. - """ - - token: str - expires_at: str - - -class JobConclusion(str, Enum): - """Conclusion of a job on GitHub. - - See :https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28\ -#list-workflow-runs-for-a-repository - - Attributes: - ACTION_REQUIRED: Represents additional action required on the job. - CANCELLED: Represents a cancelled job status. - FAILURE: Represents a failed job status. - NEUTRAL: Represents a job status that can optionally succeed or fail. - SKIPPED: Represents a skipped job status. - SUCCESS: Represents a successful job status. - TIMED_OUT: Represents a job that has timed out. - """ - - ACTION_REQUIRED = "action_required" - CANCELLED = "cancelled" - FAILURE = "failure" - NEUTRAL = "neutral" - SKIPPED = "skipped" - SUCCESS = "success" - TIMED_OUT = "timed_out" - - -class JobStats(BaseModel): - """Stats for a job on GitHub. - - Attributes: - job_id: The ID of the job. - created_at: The time the job was created. - started_at: The time the job was started. - conclusion: The end result of a job. - """ - - job_id: int - created_at: datetime - started_at: datetime - conclusion: Optional[JobConclusion] diff --git a/src/logrotate.py b/src/logrotate.py index 0fd65d5af..294c651dd 100644 --- a/src/logrotate.py +++ b/src/logrotate.py @@ -6,11 +6,11 @@ from pathlib import Path from charms.operator_libs_linux.v1 import systemd +from github_runner_manager.metrics.events import METRICS_LOG_PATH +from github_runner_manager.reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from pydantic import BaseModel from errors import LogrotateSetupError -from metrics.events import METRICS_LOG_PATH -from reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR LOG_ROTATE_TIMER_SYSTEMD_SERVICE = "logrotate.timer" diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py deleted file mode 100644 index aff75ed41..000000000 --- a/src/manager/cloud_runner_manager.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Interface of manager of runner instance on clouds.""" - -import abc -import logging -from dataclasses import dataclass -from enum import Enum, auto -from typing import Iterator, Sequence, Tuple - -from charm_state import GitHubPath, ProxyConfig, RepoPolicyComplianceConfig, SSHDebugConnection -from metrics.runner import RunnerMetrics - -logger = logging.getLogger(__name__) - -InstanceId = str - - -class HealthState(Enum): - """Health state of the runners. 
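
These deleted `TypedDict`s (now provided by `github_runner_manager.types_.github`) only constrain static type checking; at runtime the API payloads stay plain dicts. A trimmed sketch of how such a type is declared and consumed, keeping three of the real fields:

    from enum import Enum
    from typing import TypedDict

    class GitHubRunnerStatus(str, Enum):
        ONLINE = "online"
        OFFLINE = "offline"

    class SelfHostedRunner(TypedDict):  # trimmed to three of the real fields
        busy: bool
        name: str
        status: GitHubRunnerStatus

    runner: SelfHostedRunner = {
        "busy": False,
        "name": "runner-0",
        "status": GitHubRunnerStatus.ONLINE,
    }
    # Type checkers flag missing or misspelled keys; the runtime cost is zero.
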
- - Attributes: - HEALTHY: The runner is healthy. - UNHEALTHY: The runner is not healthy. - UNKNOWN: Unable to get the health state. - """ - - HEALTHY = auto() - UNHEALTHY = auto() - UNKNOWN = auto() - - -class CloudRunnerState(str, Enum): - """Represent state of the instance hosting the runner. - - Attributes: - CREATED: The instance is created. - ACTIVE: The instance is active and running. - DELETED: The instance is deleted. - ERROR: The instance has encountered error and not running. - STOPPED: The instance has stopped. - UNKNOWN: The state of the instance is not known. - UNEXPECTED: An unknown state not accounted by the developer is encountered. - """ - - CREATED = auto() - ACTIVE = auto() - DELETED = auto() - ERROR = auto() - STOPPED = auto() - UNKNOWN = auto() - UNEXPECTED = auto() - - # Exclude from coverage as not much value for testing this object conversion. - @staticmethod - def from_openstack_server_status( # pragma: no cover - openstack_server_status: str, - ) -> "CloudRunnerState": - """Create from openstack server status. - - The openstack server status are documented here: - https://docs.openstack.org/api-guide/compute/server_concepts.html - - Args: - openstack_server_status: Openstack server status. - - Returns: - The state of the runner. - """ - state = CloudRunnerState.UNEXPECTED - match openstack_server_status: - case "BUILD": - state = CloudRunnerState.CREATED - case "REBUILD": - state = CloudRunnerState.CREATED - case "ACTIVE": - state = CloudRunnerState.ACTIVE - case "ERROR": - state = CloudRunnerState.ERROR - case "STOPPED": - state = CloudRunnerState.STOPPED - case "DELETED": - state = CloudRunnerState.DELETED - case "UNKNOWN": - state = CloudRunnerState.UNKNOWN - case _: - state = CloudRunnerState.UNEXPECTED - return state - - -@dataclass -class GitHubRunnerConfig: - """Configuration for GitHub runner spawned. - - Attributes: - github_path: The GitHub organization or repository for runners to connect to. - labels: The labels to add to runners. - """ - - github_path: GitHubPath - labels: list[str] - - -@dataclass -class SupportServiceConfig: - """Configuration for supporting services for runners. - - Attributes: - proxy_config: The proxy configuration. - dockerhub_mirror: The dockerhub mirror to use for runners. - ssh_debug_connections: The information on the ssh debug services. - repo_policy_compliance: The configuration of the repo policy compliance service. - """ - - proxy_config: ProxyConfig | None - dockerhub_mirror: str | None - ssh_debug_connections: list[SSHDebugConnection] | None - repo_policy_compliance: RepoPolicyComplianceConfig | None - - -@dataclass -class CloudRunnerInstance: - """Information on the runner on the cloud. - - Attributes: - name: Name of the instance hosting the runner. - instance_id: ID of the instance. - health: Health state of the runner. - state: State of the instance hosting the runner. - """ - - name: str - instance_id: InstanceId - health: HealthState - state: CloudRunnerState - - -class CloudRunnerManager(abc.ABC): - """Manage runner instance on cloud. - - Attributes: - name_prefix: The name prefix of the self-hosted runners. - """ - - @property - @abc.abstractmethod - def name_prefix(self) -> str: - """Get the name prefix of the self-hosted runners.""" - - @abc.abstractmethod - def create_runner(self, registration_token: str) -> InstanceId: - """Create a self-hosted runner. - - Args: - registration_token: The GitHub registration token for registering runners. 
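
`from_openstack_server_status` above collapses the OpenStack server lifecycle into the charm's own `CloudRunnerState`; any status the match statement does not list lands in `UNEXPECTED` rather than raising. Usage, assuming the enum defined above:

    assert CloudRunnerState.from_openstack_server_status("BUILD") is CloudRunnerState.CREATED
    assert CloudRunnerState.from_openstack_server_status("ACTIVE") is CloudRunnerState.ACTIVE
    # Statuses the charm does not model, e.g. "MIGRATING", degrade gracefully:
    assert (
        CloudRunnerState.from_openstack_server_status("MIGRATING")
        is CloudRunnerState.UNEXPECTED
    )
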
- """ - - @abc.abstractmethod - def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: - """Get a self-hosted runner by instance id. - - Args: - instance_id: The instance id. - """ - - @abc.abstractmethod - def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerInstance]: - """Get self-hosted runners by state. - - Args: - states: Filter for the runners with these github states. If None all states will be - included. - """ - - @abc.abstractmethod - def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: - """Delete self-hosted runner. - - Args: - instance_id: The instance id of the runner to delete. - remove_token: The GitHub remove token. - """ - - @abc.abstractmethod - def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: - """Stop all runners. - - Args: - remove_token: The GitHub remove token for removing runners. - busy: If false, only idle runners are removed. If true, both idle and busy runners are - removed. - """ - - @abc.abstractmethod - def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: - """Cleanup runner and resource on the cloud. - - Perform health check on runner and delete the runner if it fails. - - Args: - remove_token: The GitHub remove token for removing runners. - """ diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py deleted file mode 100644 index 949a1df38..000000000 --- a/src/manager/github_runner_manager.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Client for managing self-hosted runner on GitHub side.""" - -from enum import Enum, auto -from typing import Iterable - -from charm_state import GitHubPath -from github_client import GithubClient -from github_type import GitHubRunnerStatus, SelfHostedRunner - - -class GitHubRunnerState(str, Enum): - """State of the self-hosted runner on GitHub. - - Attributes: - BUSY: Runner is working on a job assigned by GitHub. - IDLE: Runner is waiting to take a job or is running pre-job tasks (i.e. - repo-policy-compliance check). - OFFLINE: Runner is not connected to GitHub. - """ - - BUSY = auto() - IDLE = auto() - OFFLINE = auto() - - @staticmethod - def from_runner(runner: SelfHostedRunner) -> "GitHubRunnerState": - """Construct the object from GtiHub runner information. - - Args: - runner: Information on the GitHub self-hosted runner. - - Returns: - The state of runner. - """ - state = GitHubRunnerState.OFFLINE - # A runner that is busy and offline is possible. - if runner["busy"]: - state = GitHubRunnerState.BUSY - if runner["status"] == GitHubRunnerStatus.ONLINE: - if not runner["busy"]: - state = GitHubRunnerState.IDLE - return state - - -# Thin wrapper around the GitHub Client. Not much value in unit testing. -class GitHubRunnerManager: # pragma: no cover - """Manage self-hosted runner on GitHub side.""" - - def __init__(self, prefix: str, token: str, path: GitHubPath): - """Construct the object. - - Args: - prefix: The prefix in the name to identify the runners managed by this instance. - token: The GitHub personal access token to access the GitHub API. - path: The GitHub repository or organization to register the runners under. - """ - self._prefix = prefix - self._path = path - self.github = GithubClient(token) - - def get_runners( - self, states: Iterable[GitHubRunnerState] | None = None - ) -> tuple[SelfHostedRunner, ...]: - """Get info on self-hosted runners of certain states. 
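
The precedence in `GitHubRunnerState.from_runner` above matters: a runner can be busy while GitHub reports it offline (for example after losing network connectivity mid-job), and busy wins. Assuming the enum above and dicts trimmed to the two keys the method reads:

    online_idle = {"busy": False, "status": GitHubRunnerStatus.ONLINE}
    offline_busy = {"busy": True, "status": GitHubRunnerStatus.OFFLINE}

    assert GitHubRunnerState.from_runner(online_idle) is GitHubRunnerState.IDLE
    assert GitHubRunnerState.from_runner(offline_busy) is GitHubRunnerState.BUSY
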
- - Args: - states: Filter the runners for these states. If None, all runners are returned. - - Returns: - Information on the runners. - """ - runner_list = self.github.get_runner_github_info(self._path) - runner_list = [runner for runner in runner_list if runner.name.startswith(self._prefix)] - - if states is None: - return tuple(runner_list) - - state_set = set(states) - return tuple( - runner - for runner in runner_list - if GitHubRunnerManager._is_runner_in_state(runner, state_set) - ) - - def delete_runners(self, states: Iterable[GitHubRunnerState] | None = None) -> None: - """Delete the self-hosted runners of certain states. - - Args: - states: Filter the runners for these states. If None, all runners are deleted. - """ - runner_list = self.get_runners(states) - for runner in runner_list: - self.github.delete_runner(self._path, runner.id) - - def get_registration_token(self) -> str: - """Get registration token from GitHub. - - This token is used for registering self-hosted runners. - - Returns: - The registration token. - """ - return self.github.get_runner_registration_token(self._path) - - def get_removal_token(self) -> str: - """Get removal token from GitHub. - - This token is used for removing self-hosted runners. - - Returns: - The removal token. - """ - return self.github.get_runner_remove_token(self._path) - - @staticmethod - def _is_runner_in_state( - runner: SelfHostedRunner, states: set[GitHubRunnerState] | None - ) -> bool: - """Check that the runner is in one of the states provided. - - Args: - runner: Runner to filter. - states: States in which to check the runner belongs to. - - Returns: - True if the runner is in one of the state, else false. - """ - if states is None: - return True - return GitHubRunnerState.from_runner(runner) in states diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py deleted file mode 100644 index 72ded77fb..000000000 --- a/src/manager/runner_manager.py +++ /dev/null @@ -1,364 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Class for managing the GitHub self-hosted runners hosted on cloud instances.""" - -import logging -from dataclasses import dataclass -from enum import Enum, auto -from multiprocessing import Pool -from typing import Iterator, Sequence, Type, cast - -from charm_state import GitHubPath -from errors import GithubMetricsError, RunnerCreateError -from github_type import SelfHostedRunner -from manager.cloud_runner_manager import ( - CloudRunnerInstance, - CloudRunnerManager, - CloudRunnerState, - HealthState, - InstanceId, -) -from manager.github_runner_manager import GitHubRunnerManager, GitHubRunnerState -from metrics import events as metric_events -from metrics import github as github_metrics -from metrics import runner as runner_metrics -from metrics.runner import RunnerMetrics - -logger = logging.getLogger(__name__) - -IssuedMetricEventsStats = dict[Type[metric_events.Event], int] - - -class FlushMode(Enum): - """Strategy for flushing runners. - - Attributes: - FLUSH_IDLE: Flush idle runners. - FLUSH_BUSY: Flush busy runners. - """ - - FLUSH_IDLE = auto() - FLUSH_BUSY = auto() - - -@dataclass -class RunnerInstance: - """Represents an instance of runner. - - Attributes: - name: Full name of the runner. Managed by the cloud runner manager. - instance_id: ID of the runner. Managed by the runner manager. - health: The health state of the runner. - github_state: State on github. - cloud_state: State on cloud. 
- """ - - name: str - instance_id: InstanceId - health: HealthState - github_state: GitHubRunnerState | None - cloud_state: CloudRunnerState - - def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner | None): - """Construct an instance. - - Args: - cloud_instance: Information on the cloud instance. - github_info: Information on the GitHub of the runner. - """ - self.name = cloud_instance.name - self.instance_id = cloud_instance.instance_id - self.health = cloud_instance.health - self.github_state = ( - GitHubRunnerState.from_runner(github_info) if github_info is not None else None - ) - self.cloud_state = cloud_instance.state - - -@dataclass -class RunnerManagerConfig: - """Configuration for the runner manager. - - Attributes: - token: GitHub personal access token to query GitHub API. - path: Path to GitHub repository or organization to registry the runners. - """ - - token: str - path: GitHubPath - - -class RunnerManager: - """Manage the runners. - - Attributes: - manager_name: A name to identify this manager. - name_prefix: The name prefix of the runners. - """ - - def __init__( - self, - manager_name: str, - cloud_runner_manager: CloudRunnerManager, - config: RunnerManagerConfig, - ): - """Construct the object. - - Args: - manager_name: A name to identify this manager. - cloud_runner_manager: For managing the cloud instance of the runner. - config: Configuration of this class. - """ - self.manager_name = manager_name - self._config = config - self._cloud = cloud_runner_manager - self.name_prefix = self._cloud.name_prefix - self._github = GitHubRunnerManager( - prefix=self.name_prefix, token=self._config.token, path=self._config.path - ) - - def create_runners(self, num: int) -> tuple[InstanceId]: - """Create runners. - - Args: - num: Number of runners to create. - - Returns: - List of instance ID of the runners. - """ - logger.info("Creating %s runners", num) - registration_token = self._github.get_registration_token() - - create_runner_args = [ - RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num) - ] - return RunnerManager._spawn_runners(create_runner_args) - - def get_runners( - self, - github_states: Sequence[GitHubRunnerState] | None = None, - cloud_states: Sequence[CloudRunnerState] | None = None, - ) -> tuple[RunnerInstance]: - """Get information on runner filter by state. - - Only runners that has cloud instance are returned. - - Args: - github_states: Filter for the runners with these github states. If None all - states will be included. - cloud_states: Filter for the runners with these cloud states. If None all states - will be included. - - Returns: - Information on the runners. 
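
`RunnerInstance.__init__` above merges the authoritative cloud record with an optional GitHub record; when the GitHub side is missing, `github_state` is simply `None` rather than a guessed value, so "spawned but never registered" is directly observable. A sketch using the dataclass fields from the deleted code:

    cloud_record = CloudRunnerInstance(
        name="runner-0",
        instance_id="abc123",
        health=HealthState.UNKNOWN,
        state=CloudRunnerState.ACTIVE,
    )
    orphan = RunnerInstance(cloud_instance=cloud_record, github_info=None)
    assert orphan.github_state is None  # exists in the cloud, unknown to GitHub
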
- """ - logger.info("Getting runners...") - github_infos = self._github.get_runners(github_states) - cloud_infos = self._cloud.get_runners(cloud_states) - github_infos_map = {info["name"]: info for info in github_infos} - cloud_infos_map = {info.name: info for info in cloud_infos} - logger.info( - "Found following runners: %s", cloud_infos_map.keys() | github_infos_map.keys() - ) - - runner_names = cloud_infos_map.keys() & github_infos_map.keys() - cloud_only = cloud_infos_map.keys() - runner_names - github_only = github_infos_map.keys() - runner_names - if cloud_only: - logger.warning( - "Found runner instance on cloud but not registered on GitHub: %s", cloud_only - ) - if github_only: - logger.warning( - "Found self-hosted runner on GitHub but no matching runner instance on cloud: %s", - github_only, - ) - - runner_instances: list[RunnerInstance] = [ - RunnerInstance( - cloud_infos_map[name], github_infos_map[name] if name in github_infos_map else None - ) - for name in cloud_infos_map.keys() - ] - if cloud_states is not None: - runner_instances = [ - runner for runner in runner_instances if runner.cloud_state in cloud_states - ] - if github_states is not None: - runner_instances = [ - runner - for runner in runner_instances - if runner.github_state is not None and runner.github_state in github_states - ] - return cast(tuple[RunnerInstance], tuple(runner_instances)) - - def delete_runners(self, num: int) -> IssuedMetricEventsStats: - """Delete runners. - - Args: - num: The number of runner to delete. - - Returns: - Stats on metrics events issued during the deletion of runners. - """ - logger.info("Deleting %s number of runners", num) - runners_list = self.get_runners()[:num] - runner_names = [runner.name for runner in runners_list] - logger.info("Deleting runners: %s", runner_names) - remove_token = self._github.get_removal_token() - return self._delete_runners(runners=runners_list, remove_token=remove_token) - - def flush_runners( - self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE - ) -> IssuedMetricEventsStats: - """Delete runners according to state. - - Args: - flush_mode: The type of runners affect by the deletion. - - Returns: - Stats on metrics events issued during the deletion of runners. - """ - match flush_mode: - case FlushMode.FLUSH_IDLE: - logger.info("Flushing idle runners...") - case FlushMode.FLUSH_BUSY: - logger.info("Flushing idle and busy runners...") - case _: - logger.critical( - "Unknown flush mode %s encountered, contact developers", flush_mode - ) - - busy = False - if flush_mode == FlushMode.FLUSH_BUSY: - busy = True - remove_token = self._github.get_removal_token() - stats = self._cloud.flush_runners(remove_token, busy) - return self._issue_runner_metrics(metrics=stats) - - def cleanup(self) -> IssuedMetricEventsStats: - """Run cleanup of the runners and other resources. - - Returns: - Stats on metrics events issued during the cleanup of runners. - """ - self._github.delete_runners([GitHubRunnerState.OFFLINE]) - remove_token = self._github.get_removal_token() - deleted_runner_metrics = self._cloud.cleanup(remove_token) - return self._issue_runner_metrics(metrics=deleted_runner_metrics) - - @staticmethod - def _spawn_runners( - create_runner_args: Sequence["RunnerManager._CreateRunnerArgs"], - ) -> tuple[InstanceId, ...]: - """Parallel spawn of runners. - - The length of the create_runner_args is number _create_runner invocation, and therefore the - number of runner spawned. - - Args: - create_runner_args: List of arg for invoking _create_runner method. 
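
The name-set arithmetic in `get_runners` above is the core of the cloud/GitHub reconciliation and is easy to test in isolation: both inventories are keyed by runner name, and the two set differences are exactly the two failure modes the warnings report.

    cloud_names = {"runner-0", "runner-1"}    # keys of cloud_infos_map
    github_names = {"runner-1", "runner-2"}   # keys of github_infos_map

    matched = cloud_names & github_names      # {"runner-1"}
    cloud_only = cloud_names - matched        # spawned, never registered
    github_only = github_names - matched      # registered, instance gone

    assert cloud_only == {"runner-0"} and github_only == {"runner-2"}
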
- - Returns: - A list of instance ID of runner spawned. - """ - num = len(create_runner_args) - - instance_id_list = [] - with Pool(processes=min(num, 10)) as pool: - jobs = pool.imap_unordered( - func=RunnerManager._create_runner, iterable=create_runner_args - ) - for _ in range(num): - try: - instance_id = next(jobs) - except RunnerCreateError: - logger.exception("Failed to spawn a runner.") - except StopIteration: - break - else: - instance_id_list.append(instance_id) - return tuple(instance_id_list) - - def _delete_runners( - self, runners: Sequence[RunnerInstance], remove_token: str - ) -> IssuedMetricEventsStats: - """Delete list of runners. - - Args: - runners: The runners to delete. - remove_token: The token for removing self-hosted runners. - - Returns: - Stats on metrics events issued during the deletion of runners. - """ - runner_metrics_list = [] - for runner in runners: - deleted_runner_metrics = self._cloud.delete_runner( - instance_id=runner.instance_id, remove_token=remove_token - ) - if deleted_runner_metrics is not None: - runner_metrics_list.append(deleted_runner_metrics) - return self._issue_runner_metrics(metrics=iter(runner_metrics_list)) - - def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: - """Issue runner metrics. - - Args: - metrics: Runner metrics to issue. - - Returns: - Stats on runner metrics issued. - """ - total_stats: IssuedMetricEventsStats = {} - - for extracted_metrics in metrics: - try: - job_metrics = github_metrics.job( - github_client=self._github.github, - pre_job_metrics=extracted_metrics.pre_job, - runner_name=extracted_metrics.runner_name, - ) - except GithubMetricsError: - logger.exception( - "Failed to calculate job metrics for %s", extracted_metrics.runner_name - ) - job_metrics = None - - issued_events = runner_metrics.issue_events( - runner_metrics=extracted_metrics, - job_metrics=job_metrics, - flavor=self.manager_name, - ) - - for event_type in issued_events: - total_stats[event_type] = total_stats.get(event_type, 0) + 1 - - return total_stats - - @dataclass - class _CreateRunnerArgs: - """Arguments for the _create_runner function. - - Attrs: - cloud_runner_manager: For managing the cloud instance of the runner. - registration_token: The GitHub provided-token for registering runners. - """ - - cloud_runner_manager: CloudRunnerManager - registration_token: str - - @staticmethod - def _create_runner(args: _CreateRunnerArgs) -> InstanceId: - """Create a single runner. - - This is a staticmethod for usage with multiprocess.Pool. - - Args: - args: The arguments. - - Returns: - The instance ID of the runner created. - """ - return args.cloud_runner_manager.create_runner(registration_token=args.registration_token) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py deleted file mode 100644 index 271b92e51..000000000 --- a/src/manager/runner_scaler.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
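
`_spawn_runners` above fans runner creation out over a `multiprocessing.Pool`, which is why `_create_runner` is a picklable staticmethod taking a single dataclass argument. The error-handling shape, where one failed spawn must not abort the batch, reduces to this runnable sketch (toy `_create_runner`; the cap of 10 workers matches the deleted code):

    from multiprocessing import Pool

    class RunnerCreateError(Exception):
        """A single runner failed to spawn."""

    def _create_runner(arg: int) -> str:  # module level: must be picklable
        if arg % 3 == 0:
            raise RunnerCreateError(f"runner {arg} failed")
        return f"instance-{arg}"

    def spawn(args: list[int]) -> tuple[str, ...]:
        instance_ids = []
        with Pool(processes=min(len(args), 10)) as pool:
            jobs = pool.imap_unordered(_create_runner, args)
            while True:
                try:
                    instance_ids.append(next(jobs))
                except RunnerCreateError:
                    continue  # the real code logs and keeps the rest of the batch
                except StopIteration:
                    break
        return tuple(instance_ids)

    if __name__ == "__main__":
        print(spawn(list(range(1, 7))))  # instance-1, -2, -4, -5 in some order
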
- -"""Module for scaling the runners amount.""" - -import logging -import time -from dataclasses import dataclass - -from pydantic import MongoDsn - -import reactive.runner_manager as reactive_runner_manager -from charm_state import ReactiveConfig -from errors import IssueMetricEventError, MissingServerConfigError -from manager.cloud_runner_manager import HealthState -from manager.github_runner_manager import GitHubRunnerState -from manager.runner_manager import FlushMode, RunnerManager -from metrics import events as metric_events - -logger = logging.getLogger(__name__) - - -@dataclass -class RunnerInfo: - """Information on the runners. - - Attributes: - online: The number of runner in online state. - busy: The number of the runner in busy state. - offline: The number of runner in offline state. - unknown: The number of runner in unknown state. - runners: The names of the online runners. - busy_runners: The names of the busy runners. - """ - - online: int - busy: int - offline: int - unknown: int - runners: tuple[str, ...] - busy_runners: tuple[str, ...] - - -class RunnerScaler: - """Manage the reconcile of runners.""" - - def __init__(self, runner_manager: RunnerManager, reactive_config: ReactiveConfig | None): - """Construct the object. - - Args: - runner_manager: The RunnerManager to perform runner reconcile. - reactive_config: Reactive runner configuration. - """ - self._manager = runner_manager - self._reactive_config = reactive_config - - def get_runner_info(self) -> RunnerInfo: - """Get information on the runners. - - Returns: - The information on the runners. - """ - runner_list = self._manager.get_runners() - online = 0 - busy = 0 - offline = 0 - unknown = 0 - online_runners = [] - busy_runners = [] - for runner in runner_list: - match runner.github_state: - case GitHubRunnerState.BUSY: - online += 1 - online_runners.append(runner.name) - busy += 1 - busy_runners.append(runner.name) - case GitHubRunnerState.IDLE: - online += 1 - online_runners.append(runner.name) - case GitHubRunnerState.OFFLINE: - offline += 1 - case _: - unknown += 1 - return RunnerInfo( - online=online, - busy=busy, - offline=offline, - unknown=unknown, - runners=tuple(online_runners), - busy_runners=tuple(busy_runners), - ) - - def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: - """Flush the runners. - - Args: - flush_mode: Determines the types of runner to be flushed. - - Returns: - Number of runners flushed. - """ - metric_stats = self._manager.cleanup() - delete_metric_stats = self._manager.flush_runners(flush_mode=flush_mode) - events = set(delete_metric_stats.keys()) | set(metric_stats.keys()) - metric_stats = { - event_name: delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) - for event_name in events - } - return metric_stats.get(metric_events.RunnerStop, 0) - - def reconcile(self, quantity: int) -> int: - """Reconcile the quantity of runners. - - Args: - quantity: The number of intended runners. - - Returns: - The Change in number of runners. 
- """ - logger.info("Start reconcile to %s runner", quantity) - - if self._reactive_config is not None: - logger.info("Reactive configuration detected, going into experimental reactive mode.") - return self._reconcile_reactive(quantity, self._reactive_config.mq_uri) - - start_timestamp = time.time() - delete_metric_stats = None - metric_stats = self._manager.cleanup() - runners = self._manager.get_runners() - logger.info("Reconcile runners from %s to %s", len(runners), quantity) - runner_diff = quantity - len(runners) - if runner_diff > 0: - try: - self._manager.create_runners(runner_diff) - except MissingServerConfigError: - logging.exception( - "Unable to spawn runner due to missing server configuration, such as, image." - ) - elif runner_diff < 0: - delete_metric_stats = self._manager.delete_runners(-runner_diff) - else: - logger.info("No changes to the number of runners.") - end_timestamp = time.time() - - # Merge the two metric stats. - if delete_metric_stats is not None: - metric_stats = { - event_name: delete_metric_stats.get(event_name, 0) - + metric_stats.get(event_name, 0) - for event_name in set(delete_metric_stats) | set(metric_stats) - } - - runner_list = self._manager.get_runners() - busy_runners = [ - runner for runner in runner_list if runner.github_state == GitHubRunnerState.BUSY - ] - idle_runners = [ - runner for runner in runner_list if runner.github_state == GitHubRunnerState.IDLE - ] - offline_healthy_runners = [ - runner - for runner in runner_list - if runner.github_state == GitHubRunnerState.OFFLINE - and runner.health == HealthState.HEALTHY - ] - unhealthy_states = set((HealthState.UNHEALTHY, HealthState.UNKNOWN)) - unhealthy_runners = [runner for runner in runner_list if runner.health in unhealthy_states] - logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) - logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) - logger.info( - "Found %s offline runners that are healthy: %s", - len(offline_healthy_runners), - offline_healthy_runners, - ) - logger.info("Found %s unhealthy runners: %s", len(unhealthy_runners), unhealthy_runners) - - try: - available_runners = set(runner.name for runner in idle_runners) | set( - runner.name for runner in offline_healthy_runners - ) - logger.info( - "Current available runners (idle + healthy offline): %s", available_runners - ) - metric_events.issue_event( - metric_events.Reconciliation( - timestamp=time.time(), - flavor=self._manager.manager_name, - crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) - - metric_stats.get(metric_events.RunnerStop, 0), - idle_runners=len(available_runners), - duration=end_timestamp - start_timestamp, - ) - ) - except IssueMetricEventError: - logger.exception("Failed to issue Reconciliation metric") - - return runner_diff - - def _reconcile_reactive(self, quantity: int, mq_uri: MongoDsn) -> int: - """Reconcile runners reactively. - - Args: - quantity: Number of intended runners. - mq_uri: The URI of the MQ to use to spawn runners reactively. - - Returns: - The difference between intended runners and actual runners. In reactive mode - this number is never negative as additional processes should terminate after a timeout. 
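
Stripped of logging and metric plumbing, the non-reactive path of `reconcile` above is a plain closed-loop controller: measure, diff, act. A condensed paraphrase, with `RunnerStart`/`RunnerStop` being the event classes from the deleted metrics module:

    stats = manager.cleanup()  # also reaps unhealthy runners
    runner_diff = quantity - len(manager.get_runners())

    if runner_diff > 0:
        manager.create_runners(runner_diff)
    elif runner_diff < 0:
        delete_stats = manager.delete_runners(-runner_diff)
        stats = {
            event: stats.get(event, 0) + delete_stats.get(event, 0)
            for event in stats.keys() | delete_stats.keys()
        }

    # Crashed runners are inferred, not observed: starts without a matching stop.
    crashed = stats.get(RunnerStart, 0) - stats.get(RunnerStop, 0)
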
- """ - logger.info("Reactive mode is experimental and not yet fully implemented.") - return reactive_runner_manager.reconcile( - quantity=quantity, - mq_uri=mq_uri, - queue_name=self._manager.manager_name, - ) diff --git a/src/metrics/__init__.py b/src/metrics/__init__.py deleted file mode 100644 index d2a48eaed..000000000 --- a/src/metrics/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Package for common metrics-related code.""" diff --git a/src/metrics/events.py b/src/metrics/events.py deleted file mode 100644 index 6f858166d..000000000 --- a/src/metrics/events.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Models and functions for the metric events.""" -import logging -from pathlib import Path -from typing import Any, Optional - -from pydantic import BaseModel, NonNegativeFloat - -from errors import IssueMetricEventError - -METRICS_LOG_PATH = Path("/var/log/github-runner-metrics.log") - - -logger = logging.getLogger(__name__) - - -class Event(BaseModel): - """Base class for metric events. - - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - event: The name of the event. Will be set to the class name in snake case if not provided. - """ - - timestamp: NonNegativeFloat - event: str - - @staticmethod - def _camel_to_snake(camel_case_string: str) -> str: - """Convert a camel case string to snake case. - - Args: - camel_case_string: The string to convert. - - Returns: - The converted string. - """ - snake_case_string = camel_case_string[0].lower() - for char in camel_case_string[1:]: - if char.isupper(): - snake_case_string += "_" + char.lower() - else: - snake_case_string += char - return snake_case_string - - def __init__(self, *args: Any, **kwargs: Any): - """Initialize the event. - - Args: - args: The positional arguments to pass to the base class. - kwargs: The keyword arguments to pass to the base class. These are used to set the - specific fields. E.g. timestamp=12345 will set the timestamp field to 12345. - """ - if "event" not in kwargs: - event = self._camel_to_snake(self.__class__.__name__) - kwargs["event"] = event - super().__init__(*args, **kwargs) - - -class RunnerInstalled(Event): - """Metric event for when a runner is installed. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - duration: The duration of the installation in seconds. - """ - - flavor: str - duration: NonNegativeFloat - - -class RunnerStart(Event): - """Metric event for when a runner is started. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - workflow: The workflow name. - repo: The repository name. - github_event: The github event. - idle: The idle time in seconds. - queue_duration: The time in seconds it took before the runner picked up the job. - This is optional as we rely on the Github API and there may be problems - retrieving the data. - """ - - flavor: str - workflow: str - repo: str - github_event: str - idle: NonNegativeFloat - queue_duration: Optional[NonNegativeFloat] - - -class CodeInformation(BaseModel): - """Information about a status code. - - This could e.g. be an exit code or a http status code. - - Attributes: - code: The status code. - """ - - code: int - - -class RunnerStop(Event): - """Metric event for when a runner is stopped. 
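
Two small mechanisms in the deleted events module above carry the whole metrics design: event names are derived from the class name, and events are persisted as one JSON object per line in the metrics log. A self-contained sketch of both (a temporary path stands in for the real log location):

    import json
    from pathlib import Path

    def camel_to_snake(name: str) -> str:
        out = name[0].lower()
        for char in name[1:]:
            out += "_" + char.lower() if char.isupper() else char
        return out

    assert camel_to_snake("RunnerInstalled") == "runner_installed"

    log_path = Path("/tmp/github-runner-metrics.log")  # real path: /var/log/...
    event = {
        "timestamp": 1725889246.0,
        "event": "runner_installed",
        "flavor": "small",
        "duration": 42.0,
    }
    with log_path.open(mode="a", encoding="utf-8") as metrics_file:
        metrics_file.write(f"{json.dumps(event)}\n")
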
- - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - workflow: The workflow name. - repo: The repository name. - github_event: The github event. - status: A string describing the reason for stopping the runner. - status_info: More information about the status. - job_duration: The duration of the job in seconds. - job_conclusion: The job conclusion, e.g. "success", "failure", ... - """ - - flavor: str - workflow: str - repo: str - github_event: str - status: str - status_info: Optional[CodeInformation] - job_duration: NonNegativeFloat - job_conclusion: Optional[str] - - -class Reconciliation(Event): - """Metric event for when the charm has finished reconciliation. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - crashed_runners: The number of crashed runners. - idle_runners: The number of idle runners. - duration: The duration of the reconciliation in seconds. - """ - - flavor: str - crashed_runners: int - idle_runners: int - duration: NonNegativeFloat - - -def issue_event(event: Event) -> None: - """Issue a metric event. - - The metric event is logged to the metrics log. - - Args: - event: The metric event to log. - - Raises: - IssueMetricEventError: If the event cannot be logged. - """ - try: - with METRICS_LOG_PATH.open(mode="a", encoding="utf-8") as metrics_file: - metrics_file.write(f"{event.json(exclude_none=True)}\n") - except OSError as exc: - raise IssueMetricEventError(f"Cannot write to {METRICS_LOG_PATH}") from exc diff --git a/src/metrics/github.py b/src/metrics/github.py deleted file mode 100644 index e40574eb7..000000000 --- a/src/metrics/github.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Functions to calculate metrics from data retrieved from GitHub.""" -import logging - -from charm_state import GitHubRepo -from errors import GithubMetricsError, JobNotFoundError -from github_client import GithubClient -from metrics.runner import PreJobMetrics -from metrics.type import GithubJobMetrics - -logger = logging.getLogger(__name__) - - -def job( - github_client: GithubClient, pre_job_metrics: PreJobMetrics, runner_name: str -) -> GithubJobMetrics: - """Calculate the job metrics for a runner. - - The Github API is accessed to retrieve the job data for the runner. - - Args: - github_client: The GitHub API client. - pre_job_metrics: The pre-job metrics. - runner_name: The name of the runner. - - Raises: - GithubMetricsError: If the job for given workflow run is not found. - - Returns: - The job metrics. - """ - owner, repo = pre_job_metrics.repository.split("/", maxsplit=1) - - try: - job_info = github_client.get_job_info( - path=GitHubRepo(owner=owner, repo=repo), - workflow_run_id=pre_job_metrics.workflow_run_id, - runner_name=runner_name, - ) - except JobNotFoundError as exc: - raise GithubMetricsError from exc - logger.debug( - "Job info for runner %s with workflow run id %s: %s", - runner_name, - pre_job_metrics.workflow_run_id, - job_info, - ) - - queue_duration = (job_info.started_at - job_info.created_at).total_seconds() - - return GithubJobMetrics(queue_duration=queue_duration, conclusion=job_info.conclusion) diff --git a/src/metrics/runner.py b/src/metrics/runner.py deleted file mode 100644 index b0ccc191a..000000000 --- a/src/metrics/runner.py +++ /dev/null @@ -1,470 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
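
The deleted `metrics/github.py` above derives the only GitHub-sourced metric, queue duration, from two job timestamps. In isolation:

    from datetime import datetime, timezone

    created_at = datetime(2024, 9, 9, 12, 0, 0, tzinfo=timezone.utc)   # job queued
    started_at = datetime(2024, 9, 9, 12, 0, 42, tzinfo=timezone.utc)  # picked up

    queue_duration = (started_at - created_at).total_seconds()
    print(queue_duration)  # 42.0; negative values (a rare API quirk) are clamped
                           # to zero later, in _create_runner_start below
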
- -"""Classes and function to extract the metrics from storage and issue runner metrics events.""" - -import json -import logging -from enum import Enum -from json import JSONDecodeError -from pathlib import Path -from typing import Iterator, Optional, Type - -from pydantic import BaseModel, Field, NonNegativeFloat, ValidationError - -from errors import ( - CorruptMetricDataError, - DeleteMetricsStorageError, - IssueMetricEventError, - RunnerMetricsError, -) -from metrics import events as metric_events -from metrics.storage import MetricsStorage -from metrics.storage import StorageManager as MetricsStorageManager -from metrics.storage import move_to_quarantine -from metrics.type import GithubJobMetrics - -logger = logging.getLogger(__name__) - -FILE_SIZE_BYTES_LIMIT = 1024 -PRE_JOB_METRICS_FILE_NAME = "pre-job-metrics.json" -POST_JOB_METRICS_FILE_NAME = "post-job-metrics.json" -RUNNER_INSTALLED_TS_FILE_NAME = "runner-installed.timestamp" - - -class PreJobMetrics(BaseModel): - """Metrics for the pre-job phase of a runner. - - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - workflow: The workflow name. - workflow_run_id: The workflow run id. - repository: The repository path in the format '/'. - event: The github event. - """ - - timestamp: NonNegativeFloat - workflow: str - workflow_run_id: str - repository: str = Field(None, regex=r"^.+/.+$") - event: str - - -class PostJobStatus(str, Enum): - """The status of the post-job phase of a runner. - - Attributes: - NORMAL: Represents a normal post-job. - ABNORMAL: Represents an error with post-job. - REPO_POLICY_CHECK_FAILURE: Represents an error with repo-policy-compliance check. - """ - - NORMAL = "normal" - ABNORMAL = "abnormal" - REPO_POLICY_CHECK_FAILURE = "repo-policy-check-failure" - - -class CodeInformation(BaseModel): - """Information about a status code. - - Attributes: - code: The status code. - """ - - code: int - - -class PostJobMetrics(BaseModel): - """Metrics for the post-job phase of a runner. - - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - status: The status of the job. - status_info: More information about the status. - """ - - timestamp: NonNegativeFloat - status: PostJobStatus - status_info: Optional[CodeInformation] - - -class RunnerMetrics(BaseModel): - """Metrics for a runner. - - Attributes: - installed_timestamp: The UNIX time stamp of the time at which the runner was installed. - pre_job: The metrics for the pre-job phase. - post_job: The metrics for the post-job phase. - runner_name: The name of the runner. - """ - - installed_timestamp: NonNegativeFloat - pre_job: PreJobMetrics - post_job: Optional[PostJobMetrics] - runner_name: str - - -def extract( - metrics_storage_manager: MetricsStorageManager, runners: set[str], include: bool = False -) -> Iterator[RunnerMetrics]: - """Extract metrics from runners. - - The metrics are extracted from the metrics storage of the runners. - Orphan storages are cleaned up. - - If corrupt data is found, the metrics are not processed further and the storage is moved - to a special quarantine directory, as this may indicate that a malicious - runner is trying to manipulate the files on the storage. - - In order to avoid DoS attacks, the file size is also checked. - - Args: - metrics_storage_manager: The metrics storage manager. - runners: The runners to include or exclude. 
- include: If true the provided runners are included for metric extraction, else the provided - runners are excluded. - - Yields: - Extracted runner metrics of a particular runner. - """ - for ms in metrics_storage_manager.list_all(): - if (include and ms.runner_name in runners) or ( - not include and ms.runner_name not in runners - ): - runner_metrics = _extract_storage( - metrics_storage_manager=metrics_storage_manager, metrics_storage=ms - ) - if not runner_metrics: - logger.warning("Not able to issue metrics for runner %s", ms.runner_name) - else: - yield runner_metrics - - -def issue_events( - runner_metrics: RunnerMetrics, - flavor: str, - job_metrics: Optional[GithubJobMetrics], -) -> set[Type[metric_events.Event]]: - """Issue the metrics events for a runner. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Returns: - A set of issued events. - """ - runner_start_event = _create_runner_start(runner_metrics, flavor, job_metrics) - - issued_events = set() - try: - metric_events.issue_event(runner_start_event) - except ValidationError: - logger.exception( - "Not able to issue RunnerStart metric for " - "runner %s with pre-job metrics %s and job_metrics %s." - "Will not issue RunnerStop metric.", - runner_metrics.runner_name, - runner_metrics.pre_job, - job_metrics, - ) - except IssueMetricEventError: - logger.exception( - "Not able to issue RunnerStart metric for runner %s. " - "Will not issue RunnerStop metric.", - runner_metrics.runner_name, - ) - else: - issued_events = {metric_events.RunnerStart} - - # Return to not issuing RunnerStop metrics if RunnerStart metric could not be issued. - if not issued_events: - return issued_events - - if runner_metrics.post_job: - runner_stop_event = _create_runner_stop(runner_metrics, flavor, job_metrics) - - try: - metric_events.issue_event(runner_stop_event) - except ValidationError: - logger.exception( - "Not able to issue RunnerStop metric for " - "runner %s with pre-job metrics %s, post-job metrics %s and job_metrics %s.", - runner_metrics.runner_name, - runner_metrics.pre_job, - runner_metrics.post_job, - job_metrics, - ) - except IssueMetricEventError: - logger.exception( - "Not able to issue RunnerStop metric for runner %s.", runner_metrics.runner_name - ) - return issued_events - - issued_events.add(metric_events.RunnerStop) - - return issued_events - - -def _create_runner_start( - runner_metrics: RunnerMetrics, flavor: str, job_metrics: Optional[GithubJobMetrics] -) -> metric_events.RunnerStart: - """Create the RunnerStart event. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Returns: - The RunnerStart event. - """ - # When a job gets picked up directly after spawning, the runner_metrics installed timestamp - # might be higher than the pre-job timestamp. This is due to the fact that we issue the runner - # installed timestamp for Openstack after waiting with delays for the runner to be ready. - # We set the idle_duration to 0 in this case. - if runner_metrics.pre_job.timestamp < runner_metrics.installed_timestamp: - logger.warning( - "Pre-job timestamp %d is before installed timestamp %d for runner %s." 
- " Setting idle_duration to zero", - runner_metrics.pre_job.timestamp, - runner_metrics.installed_timestamp, - runner_metrics.runner_name, - ) - idle_duration = max(runner_metrics.pre_job.timestamp - runner_metrics.installed_timestamp, 0) - - # GitHub API returns started_at < created_at in some rare cases. - if job_metrics and job_metrics.queue_duration < 0: - logger.warning( - "Queue duration for runner %s is negative: %f. Setting it to zero.", - runner_metrics.runner_name, - job_metrics.queue_duration, - ) - queue_duration = max(job_metrics.queue_duration, 0) if job_metrics else None - - return metric_events.RunnerStart( - timestamp=runner_metrics.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics.pre_job.workflow, - repo=runner_metrics.pre_job.repository, - github_event=runner_metrics.pre_job.event, - idle=idle_duration, - queue_duration=queue_duration, - ) - - -def _create_runner_stop( - runner_metrics: RunnerMetrics, flavor: str, job_metrics: GithubJobMetrics -) -> metric_events.RunnerStop: - """Create the RunnerStop event. - - Expects that the runner_metrics.post_job is not None. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Raises: - RunnerMetricsError: Post job runner metric not found. Should not happen. - - Returns: - The RunnerStop event. - """ - if runner_metrics.post_job is None: - raise RunnerMetricsError( - "Post job runner metric not found during RunnerStop event, contact developers" - ) - - # When a job gets cancelled directly after spawning, - # the post-job timestamp might be lower then the pre-job timestamp. - # This is due to the fact that we don't have a real post-job script but rather use - # the exit code of the runner application which might exit before the pre-job script - # job is done in edge cases. See also: - # https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job#triggering-the-scripts - # We set the job_duration to 0 in this case. - if runner_metrics.post_job.timestamp < runner_metrics.pre_job.timestamp: - logger.warning( - "Post-job timestamp %d is before pre-job timestamp %d for runner %s." - " Setting job_duration to zero", - runner_metrics.post_job.timestamp, - runner_metrics.pre_job.timestamp, - runner_metrics.runner_name, - ) - job_duration = max(runner_metrics.post_job.timestamp - runner_metrics.pre_job.timestamp, 0) - - return metric_events.RunnerStop( - timestamp=runner_metrics.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics.pre_job.workflow, - repo=runner_metrics.pre_job.repository, - github_event=runner_metrics.pre_job.event, - status=runner_metrics.post_job.status, - status_info=runner_metrics.post_job.status_info, - job_duration=job_duration, - job_conclusion=job_metrics.conclusion if job_metrics else None, - ) - - -def _extract_storage( - metrics_storage_manager: MetricsStorageManager, - metrics_storage: MetricsStorage, -) -> Optional[RunnerMetrics]: - """Extract metrics from a metrics storage. - - Args: - metrics_storage_manager: The metrics storage manager. - metrics_storage: The metrics storage for a specific runner. - - Returns: - The extracted metrics if at least the pre-job metrics are present. 
- """ - runner_name = metrics_storage.runner_name - try: - logger.debug("Extracting metrics from metrics storage for runner %s", runner_name) - metrics_from_fs = _extract_metrics_from_storage(metrics_storage) - except CorruptMetricDataError: - logger.exception("Corrupt metric data found for runner %s", runner_name) - move_to_quarantine(metrics_storage_manager, runner_name) - return None - - logger.debug("Cleaning metrics storage for runner %s", runner_name) - _clean_up_storage( - metrics_storage_manager=metrics_storage_manager, metrics_storage=metrics_storage - ) - return metrics_from_fs - - -def _extract_metrics_from_storage(metrics_storage: MetricsStorage) -> Optional[RunnerMetrics]: - """Extract metrics from metrics storage for a runner. - - Args: - metrics_storage: The metrics storage for a specific runner. - - Returns: - The extracted metrics if at least the pre-job metrics are present. - - Raises: - CorruptMetricDataError: Raised if one of the files is not valid or too large. - """ - if too_large_files := _inspect_file_sizes(metrics_storage): - raise CorruptMetricDataError( - f"File size of {too_large_files} is too large. " - f"The limit is {FILE_SIZE_BYTES_LIMIT} bytes." - ) - - runner_name = metrics_storage.runner_name - try: - installed_timestamp = metrics_storage.path.joinpath( - RUNNER_INSTALLED_TS_FILE_NAME - ).read_text() - logger.debug("Runner %s installed at %s", runner_name, installed_timestamp) - except FileNotFoundError: - logger.exception("installed_timestamp not found for runner %s", runner_name) - return None - - try: - pre_job_metrics = _extract_file_from_storage( - metrics_storage=metrics_storage, filename=PRE_JOB_METRICS_FILE_NAME - ) - if not pre_job_metrics: - return None - logger.debug("Pre-job metrics for runner %s: %s", runner_name, pre_job_metrics) - - post_job_metrics = _extract_file_from_storage( - metrics_storage=metrics_storage, filename=POST_JOB_METRICS_FILE_NAME - ) - logger.debug("Post-job metrics for runner %s: %s", runner_name, post_job_metrics) - # TODO: 2024-04-02 - We should define a new error, wrap it and re-raise it. - except CorruptMetricDataError: # pylint: disable=try-except-raise - raise - - try: - return RunnerMetrics( - installed_timestamp=installed_timestamp, - pre_job=PreJobMetrics(**pre_job_metrics), - post_job=PostJobMetrics(**post_job_metrics) if post_job_metrics else None, - runner_name=runner_name, - ) - except ValidationError as exc: - raise CorruptMetricDataError(str(exc)) from exc - - -def _inspect_file_sizes(metrics_storage: MetricsStorage) -> tuple[Path, ...]: - """Inspect the file sizes of the metrics storage. - - Args: - metrics_storage: The metrics storage for a specific runner. - - Returns: - A tuple of files whose size is larger than the limit. - """ - files: list[Path] = [ - metrics_storage.path.joinpath(PRE_JOB_METRICS_FILE_NAME), - metrics_storage.path.joinpath(POST_JOB_METRICS_FILE_NAME), - metrics_storage.path.joinpath(RUNNER_INSTALLED_TS_FILE_NAME), - ] - - return tuple( - filter(lambda file: file.exists() and file.stat().st_size > FILE_SIZE_BYTES_LIMIT, files) - ) - - -def _extract_file_from_storage(metrics_storage: MetricsStorage, filename: str) -> dict | None: - """Extract a particular metric file from metrics storage. - - Args: - metrics_storage: The metrics storage for a specific runner. - filename: The metrics filename. - - Raises: - CorruptMetricDataError: If any errors have been found within the metric. - - Returns: - Metrics for the given runner if present. 
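
The two guards in the deleted extraction code above, a hard file-size ceiling and a strict "must be a JSON object" check, defend against a compromised runner writing oversized or malformed metrics. Condensed into one self-contained helper, with the limit as in the deleted code:

    import json
    from pathlib import Path

    FILE_SIZE_BYTES_LIMIT = 1024

    class CorruptMetricDataError(Exception):
        """Metric data is malformed or suspiciously large."""

    def read_metrics_file(path: Path) -> dict | None:
        if path.exists() and path.stat().st_size > FILE_SIZE_BYTES_LIMIT:
            raise CorruptMetricDataError(
                f"{path} exceeds {FILE_SIZE_BYTES_LIMIT} bytes"
            )
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return None  # an absent file is legitimate (e.g. job never started)
        except json.JSONDecodeError as exc:
            raise CorruptMetricDataError(str(exc)) from exc
        if not isinstance(data, dict):
            raise CorruptMetricDataError(f"{path} is not a JSON object")
        return data
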
- """ - try: - job_metrics = json.loads( - metrics_storage.path.joinpath(filename).read_text(encoding="utf-8") - ) - except FileNotFoundError: - logger.warning("%s not found for runner %s.", filename, metrics_storage.runner_name) - return None - except JSONDecodeError as exc: - raise CorruptMetricDataError(str(exc)) from exc - if not isinstance(job_metrics, dict): - raise CorruptMetricDataError( - f"{filename} metrics for runner {metrics_storage.runner_name} is not a JSON object." - ) - return job_metrics - - -def _clean_up_storage( - metrics_storage_manager: MetricsStorageManager, metrics_storage: MetricsStorage -) -> None: - """Clean up the metrics storage. - - Remove all metric files and afterwards the storage. - - Args: - metrics_storage_manager: The metrics storage manager. - metrics_storage: The metrics storage for a specific runner. - """ - try: - metrics_storage.path.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).unlink(missing_ok=True) - metrics_storage.path.joinpath(PRE_JOB_METRICS_FILE_NAME).unlink(missing_ok=True) - metrics_storage.path.joinpath(POST_JOB_METRICS_FILE_NAME).unlink(missing_ok=True) - except OSError: - logger.exception( - "Could not remove metric files for runner %s, " - "this may lead to duplicate metrics issued", - metrics_storage.runner_name, - ) - - try: - metrics_storage_manager.delete(metrics_storage.runner_name) - except DeleteMetricsStorageError: - logger.exception( - "Could not delete metrics storage for runner %s.", metrics_storage.runner_name - ) diff --git a/src/metrics/runner_logs.py b/src/metrics/runner_logs.py deleted file mode 100644 index ec7923c9c..000000000 --- a/src/metrics/runner_logs.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Functions to pull and remove the logs of the crashed runners.""" - -import logging -import shutil -import time -from datetime import datetime -from pathlib import Path - -RUNNER_LOGS_DIR_PATH = Path("/var/log/github-runner-logs") - -SYSLOG_PATH = Path("/var/log/syslog") - -OUTDATED_LOGS_IN_SECONDS = 7 * 24 * 60 * 60 - -logger = logging.getLogger(__name__) - - -def create_logs_dir(runner_name: str) -> Path: - """Create the directory to store the logs of the crashed runners. - - Args: - runner_name: The name of the runner. - - Returns: - The path to the directory where the logs of the crashed runners will be stored. - """ - target_log_path = RUNNER_LOGS_DIR_PATH / runner_name - target_log_path.mkdir(parents=True, exist_ok=True) - - return target_log_path - - -def remove_outdated() -> None: - """Remove the logs that are too old.""" - maxage_absolute = time.time() - OUTDATED_LOGS_IN_SECONDS - dt_object = datetime.fromtimestamp(maxage_absolute) - logger.info( - "Removing the outdated logs of the crashed runners. " - "All logs older than %s will be removed.", - dt_object.strftime("%Y-%m-%d %H:%M:%S"), - ) - - for log_path in RUNNER_LOGS_DIR_PATH.glob("*"): - if log_path.is_dir() and (log_path.stat().st_mtime < maxage_absolute): - logger.info("Removing the outdated logs of the runner %s.", log_path.name) - try: - shutil.rmtree(log_path) - except OSError: - logger.exception( - "Unable to remove the outdated logs of the runner %s.", log_path.name - ) diff --git a/src/metrics/storage.py b/src/metrics/storage.py deleted file mode 100644 index c9b41a2f5..000000000 --- a/src/metrics/storage.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Classes and functions defining the metrics storage. 
- -It contains a protocol and reference implementation. -""" -import logging -import shutil -import tarfile -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, Iterator, Protocol - -from errors import ( - CreateMetricsStorageError, - DeleteMetricsStorageError, - GetMetricsStorageError, - QuarantineMetricsStorageError, -) - -FILESYSTEM_OWNER = "ubuntu:ubuntu" -FILESYSTEM_BASE_PATH = Path("/home/ubuntu/runner-fs") -FILESYSTEM_QUARANTINE_PATH = Path("/home/ubuntu/runner-fs-quarantine") - -logger = logging.getLogger(__name__) - - -@dataclass -class MetricsStorage: - """Storage for the metrics. - - Attributes: - path: The path to the directory holding the metrics inside the charm. - runner_name: The name of the associated runner. - """ - - path: Path - runner_name: str - - -class StorageManager(Protocol): # pylint: disable=too-few-public-methods - """A protocol defining the methods for managing the metrics storage. - - Attributes: - create: Method to create a new storage. Returns the created storage. - Raises an exception CreateMetricsStorageError if the storage already exists. - list_all: Method to list all storages. - get: Method to get a storage by name. - delete: Method to delete a storage by name. - """ - - create: Callable[[str], MetricsStorage] - list_all: Callable[[], Iterator[MetricsStorage]] - get: Callable[[str], MetricsStorage] - delete: Callable[[str], None] - - -def _get_runner_fs_path(runner_name: str) -> Path: - """Get the path of the runner shared filesystem. - - Args: - runner_name: The name of the runner. - - Returns: - The path of the runner shared filesystem. - """ - return FILESYSTEM_BASE_PATH / runner_name - - -def create(runner_name: str) -> MetricsStorage: - """Create metrics storage for the runner. - - The method is not idempotent and will raise an exception - if the storage already exists. - - Args: - runner_name: The name of the runner. - - Returns: - The metrics storage object. - - Raises: - CreateMetricsStorageError: If the creation of the shared filesystem fails. - """ - try: - FILESYSTEM_BASE_PATH.mkdir(exist_ok=True) - FILESYSTEM_QUARANTINE_PATH.mkdir(exist_ok=True) - except OSError as exc: - raise CreateMetricsStorageError("Failed to create metrics storage directories") from exc - - runner_fs_path = _get_runner_fs_path(runner_name) - - try: - runner_fs_path.mkdir() - except FileExistsError as exc: - raise CreateMetricsStorageError( - f"Metrics storage for runner {runner_name} already exists." - ) from exc - - return MetricsStorage(runner_fs_path, runner_name) - - -def list_all() -> Iterator[MetricsStorage]: - """List all the metric storages. - - Yields: - A metrics storage object. - """ - if not FILESYSTEM_BASE_PATH.exists(): - return - - directories = (entry for entry in FILESYSTEM_BASE_PATH.iterdir() if entry.is_dir()) - for directory in directories: - try: - fs = get(runner_name=directory.name) - except GetMetricsStorageError: - logger.error("Failed to get metrics storage for runner %s", directory.name) - else: - yield fs - - -def get(runner_name: str) -> MetricsStorage: - """Get the metrics storage for the runner. - - Args: - runner_name: The name of the runner. - - Returns: - The metrics storage object. - - Raises: - GetMetricsStorageError: If the storage does not exist. 
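# The StorageManager protocol above is structural: any object exposing
# create/list_all/get/delete with matching signatures satisfies it, with no
# inheritance required. A minimal in-memory stand-in, e.g. for tests, might
# look like this (a sketch, not the charm's implementation):
from pathlib import Path
from typing import Iterator

class InMemoryStorageManager:
    """Keep MetricsStorage records in a dict instead of on disk."""

    def __init__(self, base: Path) -> None:
        self._base = base
        self._storages: dict[str, MetricsStorage] = {}

    def create(self, runner_name: str) -> MetricsStorage:
        if runner_name in self._storages:
            raise CreateMetricsStorageError(f"Storage for {runner_name} already exists.")
        self._storages[runner_name] = MetricsStorage(self._base / runner_name, runner_name)
        return self._storages[runner_name]

    def list_all(self) -> Iterator[MetricsStorage]:
        yield from self._storages.values()

    def get(self, runner_name: str) -> MetricsStorage:
        try:
            return self._storages[runner_name]
        except KeyError as exc:
            raise GetMetricsStorageError(f"No storage for {runner_name}.") from exc

    def delete(self, runner_name: str) -> None:
        if self._storages.pop(runner_name, None) is None:
            raise DeleteMetricsStorageError(f"No storage for {runner_name}.")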
- """ - runner_fs_path = _get_runner_fs_path(runner_name) - if not runner_fs_path.exists(): - raise GetMetricsStorageError(f"Metrics storage for runner {runner_name} not found.") - - return MetricsStorage(runner_fs_path, runner_name) - - -def delete(runner_name: str) -> None: - """Delete the metrics storage for the runner. - - Args: - runner_name: The name of the runner. - - Raises: - DeleteMetricsStorageError: If the storage could not be deleted. - """ - runner_fs_path = _get_runner_fs_path(runner_name=runner_name) - - try: - shutil.rmtree(runner_fs_path) - except OSError as exc: - raise DeleteMetricsStorageError( - f"Failed to remove metrics storage for runner {runner_name}" - ) from exc - - -def move_to_quarantine(storage_manager: StorageManager, runner_name: str) -> None: - """Archive the metrics storage for the runner and delete it. - - Args: - storage_manager: The storage manager. - runner_name: The name of the runner. - - Raises: - QuarantineMetricsStorageError: If the metrics storage could not be quarantined. - """ - try: - runner_fs = storage_manager.get(runner_name) - except GetMetricsStorageError as exc: - raise QuarantineMetricsStorageError( - f"Failed to get metrics storage for runner {runner_name}" - ) from exc - - tarfile_path = FILESYSTEM_QUARANTINE_PATH.joinpath(runner_name).with_suffix(".tar.gz") - try: - with tarfile.open(tarfile_path, "w:gz") as tar: - tar.add(runner_fs.path, arcname=runner_fs.path.name) - except OSError as exc: - raise QuarantineMetricsStorageError( - f"Failed to archive metrics storage for runner {runner_name}" - ) from exc - - try: - storage_manager.delete(runner_name) - except DeleteMetricsStorageError as exc: - raise QuarantineMetricsStorageError( - f"Failed to delete metrics storage for runner {runner_name}" - ) from exc diff --git a/src/metrics/type.py b/src/metrics/type.py deleted file mode 100644 index fd45314f6..000000000 --- a/src/metrics/type.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Data types used by modules handling metrics.""" - -from typing import NamedTuple, Optional - -from github_type import JobConclusion - - -class GithubJobMetrics(NamedTuple): - """Metrics about a job. - - Attributes: - queue_duration: The time in seconds the job took before the runner picked it up. - conclusion: The conclusion of the job. - """ - - queue_duration: float - conclusion: Optional[JobConclusion] diff --git a/src/openstack_cloud/__init__.py b/src/openstack_cloud/__init__.py deleted file mode 100644 index 3f9935aab..000000000 --- a/src/openstack_cloud/__init__.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Module for managing Openstack cloud.""" - -import logging -from pathlib import Path -from typing import TypedDict, cast - -import yaml - -from errors import OpenStackInvalidConfigError - -logger = logging.getLogger(__name__) - - -CLOUDS_YAML_PATH = Path(Path.home() / ".config/openstack/clouds.yaml") - - -class CloudConfig(TypedDict): - """The parsed clouds.yaml configuration dictionary. - - Attributes: - clouds: A mapping of key "clouds" to cloud name mapped to cloud configuration. - """ - - clouds: dict[str, dict] - - -def _validate_cloud_config(cloud_config: dict) -> CloudConfig: - """Validate the format of the cloud configuration. - - Args: - cloud_config: The configuration in clouds.yaml format to validate. 
- - Raises: - OpenStackInvalidConfigError: if the format of the config is invalid. - - Returns: - A typed cloud_config dictionary. - """ - # dict of format: {clouds: : } - try: - clouds = list(cloud_config["clouds"].keys()) - except KeyError as exc: - raise OpenStackInvalidConfigError("Missing key 'clouds' from config.") from exc - if not clouds: - raise OpenStackInvalidConfigError("No clouds defined in clouds.yaml.") - return cast(CloudConfig, cloud_config) - - -def _write_config_to_disk(cloud_config: CloudConfig) -> None: - """Write the cloud configuration to disk. - - Args: - cloud_config: The configuration in clouds.yaml format to write to disk. - """ - CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) - CLOUDS_YAML_PATH.write_text(encoding="utf-8", data=yaml.dump(cloud_config)) - - -def initialize(cloud_config: dict) -> None: - """Initialize Openstack integration. - - Validates config and writes it to disk. - - Raises: - OpenStackInvalidConfigError: If there was an given cloud config. - - Args: - cloud_config: The configuration in clouds.yaml format to apply. - """ - try: - valid_config = _validate_cloud_config(cloud_config) - # TODO: 2024-04-02 - We should define a new error, wrap it and re-raise it. - except OpenStackInvalidConfigError: # pylint: disable=try-except-raise - raise - _write_config_to_disk(valid_config) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py deleted file mode 100644 index ad21f4d97..000000000 --- a/src/openstack_cloud/openstack_cloud.py +++ /dev/null @@ -1,597 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Class for accessing OpenStack API for managing servers.""" - -import logging -from contextlib import contextmanager -from dataclasses import dataclass -from datetime import datetime -from functools import reduce -from pathlib import Path -from typing import Iterable, Iterator, cast - -import openstack -import openstack.exceptions -import paramiko -import yaml -from fabric import Connection as SSHConnection -from openstack.compute.v2.keypair import Keypair as OpenstackKeypair -from openstack.compute.v2.server import Server as OpenstackServer -from openstack.connection import Connection as OpenstackConnection -from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup -from paramiko.ssh_exception import NoValidConnectionsError - -from errors import KeyfileError, OpenStackError, SSHError -from utilities import retry - -logger = logging.getLogger(__name__) - -_CLOUDS_YAML_PATH = Path.home() / ".config/openstack/clouds.yaml" - -# Update the version when the security group rules are not backward compatible. -_SECURITY_GROUP_NAME = "github-runner-v1" - -_CREATE_SERVER_TIMEOUT = 5 * 60 -_SSH_TIMEOUT = 30 -_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") -_TEST_STRING = "test_string" - - -@dataclass -class OpenstackInstance: - """Represents an OpenStack instance. - - Attributes: - server_id: ID of server assigned by OpenStack. - server_name: Name of the server on OpenStack. - instance_id: ID used by OpenstackCloud class to manage the instances. See docs on the - OpenstackCloud. - addresses: IP addresses assigned to the server. - status: Status of the server. - """ - - server_id: str - server_name: str - instance_id: str - addresses: list[str] - status: str - - def __init__(self, server: OpenstackServer, prefix: str): - """Construct the object. - - Args: - server: The OpenStack server. - prefix: The name prefix for the servers. 
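# The naming convention the constructor below enforces: every managed server is
# named "<prefix>-<instance_id>", and the instance id is recovered by stripping
# the prefix plus the joining dash. Values are illustrative.
prefix = "github-runner-0"
instance_id = "0f3a9c2d1b4e"
server_name = f"{prefix}-{instance_id}"
assert server_name.startswith(f"{prefix}-")
assert server_name[len(prefix) + 1 :] == instance_id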
- - Raises: - ValueError: Provided server should not be managed under this prefix. - """ - self.server_id = server.id - self.server_name = server.name - self.status = server.status - self.addresses = [ - address["addr"] - for network_addresses in server.addresses.values() - for address in network_addresses - ] - - if not self.server_name.startswith(f"{prefix}-"): - # Should never happen. - raise ValueError( - f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" - ) - self.instance_id = self.server_name[len(prefix) + 1 :] - - -@contextmanager -@retry(tries=2, delay=5, local_logger=logger) -def _get_openstack_connection( - clouds_config: dict[str, dict], cloud: str -) -> Iterator[OpenstackConnection]: - """Create a connection context managed object, to be used within with statements. - - The file of _CLOUDS_YAML_PATH should only be modified by this function. - - Args: - clouds_config: The configuration in clouds.yaml format to apply. - cloud: The name of cloud to use in the clouds.yaml. - - Raises: - OpenStackError: if the credentials provided is not authorized. - - Yields: - An openstack.connection.Connection object. - """ - if not _CLOUDS_YAML_PATH.exists(): - _CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) - - # Concurrency: Very small chance for the file to be corrupted due to multiple process calling - # this function and writing the file at the same time. This should cause the `conn.authorize` - # to fail, and retry of this function would resolve this. - _CLOUDS_YAML_PATH.write_text(data=yaml.dump(clouds_config), encoding="utf-8") - - # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but - # I could not reproduce it. Therefore, no catch here for such exception. - try: - with openstack.connect(cloud=cloud) as conn: - conn.authorize() - yield conn - # pylint thinks this isn't an exception, but does inherit from Exception class. - except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause - logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc - - -class OpenstackCloud: - """Client to interact with OpenStack cloud. - - The OpenStack server name is managed by this cloud. Caller refers to the instances via - instance_id. If the caller needs the server name, e.g., for logging, it can be queried with - get_server_name. - """ - - def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): - """Create the object. - - Args: - clouds_config: The openstack clouds.yaml in dict format. - cloud: The name of cloud to use in the clouds.yaml. - prefix: Prefix attached to names of resource managed by this instance. Used for - identifying which resource belongs to this instance. - """ - self._clouds_config = clouds_config - self._cloud = cloud - self.prefix = prefix - - # Ignore "Too many arguments" as 6 args should be fine. Move to a dataclass if new args are - # added. - def launch_instance( # pylint: disable=R0913 - self, instance_id: str, image: str, flavor: str, network: str, cloud_init: str - ) -> OpenstackInstance: - """Create an OpenStack instance. - - Args: - instance_id: The instance ID to form the instance name. - image: The image used to create the instance. - flavor: The flavor used to create the instance. - network: The network used to create the instance. - cloud_init: The cloud init userdata to startup the instance. - - Raises: - OpenStackError: Unable to create OpenStack server. 
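# A sketch of how the manager layer drives OpenstackCloud, assuming a parsed
# clouds.yaml dict like the one shown earlier; image, flavor and network names
# are placeholders for resources that must already exist in the cloud.
cloud = OpenstackCloud(
    clouds_config=minimal_cloud_config, cloud="mycloud", prefix="github-runner-0"
)
instance = cloud.launch_instance(
    instance_id="0f3a9c2d1b4e",
    image="jammy-server-cloudimg-amd64",
    flavor="m1.small",
    network="runner-network",
    cloud_init="#cloud-config\n",
)
print(instance.server_name, instance.status)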
- - Returns: - The OpenStack instance created. - """ - full_name = self.get_server_name(instance_id) - logger.info("Creating openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - security_group = OpenstackCloud._ensure_security_group(conn) - keypair = OpenstackCloud._setup_keypair(conn, full_name) - - try: - server = conn.create_server( - name=full_name, - image=image, - key_name=keypair.name, - flavor=flavor, - network=network, - security_groups=[security_group.id], - userdata=cloud_init, - auto_ip=False, - timeout=_CREATE_SERVER_TIMEOUT, - wait=True, - ) - except openstack.exceptions.ResourceTimeout as err: - logger.exception("Timeout creating openstack server %s", full_name) - logger.info( - "Attempting clean up of openstack server %s that timeout during creation", - full_name, - ) - self._delete_instance(conn, full_name) - raise OpenStackError(f"Timeout creating openstack server {full_name}") from err - except openstack.exceptions.SDKException as err: - logger.exception("Failed to create openstack server %s", full_name) - self._delete_keypair(conn, instance_id) - raise OpenStackError(f"Failed to create openstack server {full_name}") from err - - return OpenstackInstance(server, self.prefix) - - def get_instance(self, instance_id: str) -> OpenstackInstance | None: - """Get OpenStack instance by instance ID. - - Args: - instance_id: The instance ID. - - Returns: - The OpenStack instance if found. - """ - full_name = self.get_server_name(instance_id) - logger.info("Getting openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - if server is not None: - return OpenstackInstance(server, self.prefix) - return None - - def delete_instance(self, instance_id: str) -> None: - """Delete a openstack instance. - - Args: - instance_id: The instance ID of the instance to delete. - """ - full_name = self.get_server_name(instance_id) - logger.info("Deleting openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - self._delete_instance(conn, full_name) - - def _delete_instance(self, conn: OpenstackConnection, full_name: str) -> None: - """Delete a openstack instance. - - Raises: - OpenStackError: Unable to delete OpenStack server. - - Args: - conn: The openstack connection to use. - full_name: The full name of the server. - """ - try: - server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - if server is not None: - conn.delete_server(name_or_id=server.id) - OpenstackCloud._delete_keypair(conn, full_name) - except ( - openstack.exceptions.SDKException, - openstack.exceptions.ResourceTimeout, - ) as err: - raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err - - def get_ssh_connection(self, instance: OpenstackInstance) -> SSHConnection: - """Get SSH connection to an OpenStack instance. - - Args: - instance: The OpenStack instance to connect to. - - Raises: - SSHError: Unable to get a working SSH connection to the instance. - KeyfileError: Unable to find the keyfile to connect to the instance. - - Returns: - SSH connection object. 
- """ - key_path = OpenstackCloud._get_key_path(instance.server_name) - - if not key_path.exists(): - raise KeyfileError( - f"Missing keyfile for server: {instance.server_name}, key path: {key_path}" - ) - if not instance.addresses: - raise SSHError(f"No addresses found for OpenStack server {instance.server_name}") - - for ip in instance.addresses: - try: - connection = SSHConnection( - host=ip, - user="ubuntu", - connect_kwargs={"key_filename": str(key_path)}, - connect_timeout=_SSH_TIMEOUT, - ) - result = connection.run(f"echo {_TEST_STRING}", warn=True, timeout=_SSH_TIMEOUT) - if not result.ok: - logger.warning( - "SSH test connection failed, server: %s, address: %s", - instance.server_name, - ip, - ) - continue - if _TEST_STRING in result.stdout: - return connection - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException): - logger.warning( - "Unable to SSH into %s with address %s", - instance.server_name, - connection.host, - exc_info=True, - ) - continue - raise SSHError( - f"No connectable SSH addresses found, server: {instance.server_name}, " - f"addresses: {instance.addresses}" - ) - - def get_instances(self) -> tuple[OpenstackInstance, ...]: - """Get all OpenStack instances. - - Returns: - The OpenStack instances. - """ - logger.info("Getting all openstack servers managed by the charm") - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - instance_list = self._get_openstack_instances(conn) - server_names = set(server.name for server in instance_list) - - server_list = [ - OpenstackCloud._get_and_ensure_unique_server(conn, name) for name in server_names - ] - return tuple( - OpenstackInstance(server, self.prefix) - for server in server_list - if server is not None - ) - - def cleanup(self) -> None: - """Cleanup unused key files and openstack keypairs.""" - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - instances = self._get_openstack_instances(conn) - exclude_list = [server.name for server in instances] - self._cleanup_key_files(exclude_list) - self._cleanup_openstack_keypairs(conn, exclude_list) - - def get_server_name(self, instance_id: str) -> str: - """Get server name on OpenStack. - - Args: - instance_id: ID used to identify a instance. - - Returns: - The OpenStack server name. - """ - return f"{self.prefix}-{instance_id}" - - def _cleanup_key_files(self, exclude_instances: Iterable[str]) -> None: - """Delete all SSH key files except the specified instances. - - Args: - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up SSH key files") - exclude_filename = set( - OpenstackCloud._get_key_path(instance) for instance in exclude_instances - ) - - total = 0 - deleted = 0 - for path in _SSH_KEY_PATH.iterdir(): - # Find key file from this application. - if path.is_file() and path.name.startswith(self.prefix) and path.name.endswith(".key"): - total += 1 - if path in exclude_filename: - continue - path.unlink() - deleted += 1 - logger.info("Found %s key files, clean up %s key files", total, deleted) - - def _cleanup_openstack_keypairs( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all OpenStack keypairs except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. 
- """ - logger.info("Cleaning up openstack keypairs") - exclude_instance_set = set(exclude_instances) - keypairs = conn.list_keypairs() - for key in keypairs: - # The `name` attribute is of resource.Body type. - if key.name and str(key.name).startswith(self.prefix): - if str(key.name) in exclude_instance_set: - continue - try: - self._delete_keypair(conn, key.name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - key.name, - ) - - def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[OpenstackServer, ...]: - """Get the OpenStack servers managed by this unit. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - List of OpenStack instances. - """ - return tuple( - server - for server in cast(list[OpenstackServer], conn.list_servers()) - if server.name.startswith(f"{self.prefix}-") - ) - - @staticmethod - def _get_and_ensure_unique_server( - conn: OpenstackConnection, name: str - ) -> OpenstackServer | None: - """Get the latest server of the name and ensure it is unique. - - If multiple servers with the same name are found, the latest server in creation time is - returned. Other servers is deleted. - - Args: - conn: The connection to OpenStack. - name: The name of the OpenStack name. - - Returns: - A server with the name. - """ - servers: list[OpenstackServer] = conn.search_servers(name) - - if not servers: - return None - - # 2024/08/14: The `format` arg for `strptime` is the default format. - # This is only provided to get around a bug of the function with type checking. - latest_server = reduce( - lambda a, b: ( - a - if datetime.strptime(a.created_at, "a %b %d %H:%M:%S %Y") - < datetime.strptime(b.create_at, "a %b %d %H:%M:%S %Y") - else b - ), - servers, - ) - outdated_servers = filter(lambda x: x != latest_server, servers) - for server in outdated_servers: - try: - conn.delete_server(name_or_id=server.id) - except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): - logger.warning( - "Unable to delete server with duplicate name %s with ID %s", - name, - server.id, - stack_info=True, - ) - - return latest_server - - @staticmethod - def _get_key_path(name: str) -> Path: - """Get the filepath for storing private SSH of a runner. - - Args: - name: The name of the runner. - - Returns: - Path to reserved for the key file of the runner. - """ - return _SSH_KEY_PATH / f"{name}.key" - - @staticmethod - def _setup_keypair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: - """Create OpenStack keypair. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the keypair. - - Returns: - The OpenStack keypair. - """ - key_path = OpenstackCloud._get_key_path(name) - - if key_path.exists(): - logger.warning("Existing private key file for %s found, removing it.", name) - key_path.unlink(missing_ok=True) - - keypair = conn.create_keypair(name=name) - key_path.parent.mkdir(parents=True, exist_ok=True) - key_path.write_text(keypair.private_key) - key_path.chmod(0o400) - return keypair - - @staticmethod - def _delete_keypair(conn: OpenstackConnection, name: str) -> None: - """Delete OpenStack keypair. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the keypair. - """ - try: - # Keypair have unique names, access by ID is not needed. 
- if not conn.delete_keypair(name): - logger.warning("Unable to delete keypair for %s", name) - except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): - logger.warning("Unable to delete keypair for %s", name, stack_info=True) - - key_path = OpenstackCloud._get_key_path(name) - key_path.unlink(missing_ok=True) - - @staticmethod - def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: - """Ensure runner security group exists. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - The security group with the rules for runners. - """ - rule_exists_icmp = False - rule_exists_ssh = False - rule_exists_tmate_ssh = False - - security_group_list = conn.list_security_groups(filters={"name": _SECURITY_GROUP_NAME}) - # Pick the first security_group returned. - security_group = next(iter(security_group_list), None) - if security_group is None: - logger.info("Security group %s not found, creating it", _SECURITY_GROUP_NAME) - security_group = conn.create_security_group( - name=_SECURITY_GROUP_NAME, - description="For servers managed by the github-runner charm.", - ) - else: - existing_rules = security_group.security_group_rules - for rule in existing_rules: - if rule["protocol"] == "icmp": - logger.debug( - "Found ICMP rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - rule_exists_icmp = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 22 - ): - logger.debug( - "Found SSH rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - rule_exists_ssh = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 10022 - ): - logger.debug( - "Found tmate SSH rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - rule_exists_tmate_ssh = True - - if not rule_exists_icmp: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - protocol="icmp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - port_range_min="22", - port_range_max="22", - protocol="tcp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_tmate_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - port_range_min="10022", - port_range_max="10022", - protocol="tcp", - direction="egress", - ethertype="IPv4", - ) - return security_group diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py deleted file mode 100644 index 11bac0b92..000000000 --- a/src/openstack_cloud/openstack_runner_manager.py +++ /dev/null @@ -1,830 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
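# The rule set _ensure_security_group above converges on, summarised as data.
# Directions mirror the code exactly (the tmate SSH rule is created as egress,
# the other two as ingress); no values here are new.
SECURITY_GROUP_RULES = (
    {"protocol": "icmp", "direction": "ingress", "ethertype": "IPv4"},
    {"protocol": "tcp", "port_range_min": "22", "port_range_max": "22",
     "direction": "ingress", "ethertype": "IPv4"},
    {"protocol": "tcp", "port_range_min": "10022", "port_range_max": "10022",
     "direction": "egress", "ethertype": "IPv4"},
)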
- -"""Manager for self-hosted runner on OpenStack.""" - -import logging -import secrets -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Iterator, Sequence - -import invoke -import jinja2 -import paramiko -import paramiko.ssh_exception -from fabric import Connection as SSHConnection - -from charm_state import GitHubOrg -from errors import ( - CreateMetricsStorageError, - GetMetricsStorageError, - IssueMetricEventError, - KeyfileError, - MissingServerConfigError, - OpenStackError, - RunnerCreateError, - RunnerStartError, - SSHError, -) -from manager.cloud_runner_manager import ( - CloudRunnerInstance, - CloudRunnerManager, - CloudRunnerState, - GitHubRunnerConfig, - InstanceId, - SupportServiceConfig, -) -from manager.runner_manager import HealthState -from metrics import events as metric_events -from metrics import runner as runner_metrics -from metrics import storage as metrics_storage -from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance -from repo_policy_compliance_client import RepoPolicyComplianceClient -from utilities import retry, set_env_var - -logger = logging.getLogger(__name__) - -BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME = "scripts/build-openstack-image.sh" -_CONFIG_SCRIPT_PATH = Path("/home/ubuntu/actions-runner/config.sh") - -RUNNER_APPLICATION = Path("/home/ubuntu/actions-runner") -METRICS_EXCHANGE_PATH = Path("/home/ubuntu/metrics-exchange") -PRE_JOB_SCRIPT = RUNNER_APPLICATION / "pre-job.sh" -MAX_METRICS_FILE_SIZE = 1024 - -RUNNER_STARTUP_PROCESS = "/home/ubuntu/actions-runner/run.sh" -RUNNER_LISTENER_PROCESS = "Runner.Listener" -RUNNER_WORKER_PROCESS = "Runner.Worker" -CREATE_SERVER_TIMEOUT = 5 * 60 - - -class _GithubRunnerRemoveError(Exception): - """Represents an error while SSH into a runner and running the remove script.""" - - -class _PullFileError(Exception): - """Represents an error while pulling a file from the runner instance.""" - - -@dataclass -class OpenStackCloudConfig: - """Configuration for OpenStack cloud authorisation information. - - Attributes: - clouds_config: The clouds.yaml. - cloud: The cloud name to connect to. - """ - - clouds_config: dict[str, dict] - cloud: str - - -@dataclass -class OpenStackServerConfig: - """Configuration for OpenStack server. - - Attributes: - image: The image name for runners to use. - flavor: The flavor name for runners to use. - network: The network name for runners to use. - """ - - image: str - flavor: str - network: str - - -@dataclass -class _RunnerHealth: - """Runners with health state. - - Attributes: - healthy: The list of healthy runners. - unhealthy: The list of unhealthy runners. - """ - - healthy: tuple[OpenstackInstance, ...] - unhealthy: tuple[OpenstackInstance, ...] - - -class OpenStackRunnerManager(CloudRunnerManager): - """Manage self-hosted runner on OpenStack cloud. - - Attributes: - name_prefix: The name prefix of the runners created. - """ - - # Ignore "Too many arguments", as the class requires a lot of configurations. - def __init__( # pylint: disable=R0913 - self, - manager_name: str, - prefix: str, - cloud_config: OpenStackCloudConfig, - server_config: OpenStackServerConfig | None, - runner_config: GitHubRunnerConfig, - service_config: SupportServiceConfig, - ) -> None: - """Construct the object. - - Args: - manager_name: A name to identify this manager. - prefix: The prefix to runner name. - cloud_config: The configuration for OpenStack authorisation. - server_config: The configuration for creating OpenStack server. 
Unable to create - runner if None. - runner_config: The configuration for the runner. - service_config: The configuration of supporting services of the runners. - """ - self._manager_name = manager_name - self._prefix = prefix - self._cloud_config = cloud_config - self._server_config = server_config - self._runner_config = runner_config - self._service_config = service_config - self._openstack_cloud = OpenstackCloud( - clouds_config=self._cloud_config.clouds_config, - cloud=self._cloud_config.cloud, - prefix=self.name_prefix, - ) - - # Setting the env var to this process and any child process spawned. - proxies = service_config.proxy_config - if no_proxy := proxies.no_proxy: - set_env_var("NO_PROXY", no_proxy) - if http_proxy := proxies.http: - set_env_var("HTTP_PROXY", http_proxy) - if https_proxy := proxies.https: - set_env_var("HTTPS_PROXY", https_proxy) - - @property - def name_prefix(self) -> str: - """The prefix of runner names. - - Returns: - The prefix of the runner names managed by this class. - """ - return self._prefix - - def create_runner(self, registration_token: str) -> InstanceId: - """Create a self-hosted runner. - - Args: - registration_token: The GitHub registration token for registering runners. - - Raises: - MissingServerConfigError: Unable to create runner due to missing configuration. - RunnerCreateError: Unable to create runner due to OpenStack issues. - - Returns: - Instance ID of the runner. - """ - if self._server_config is None: - raise MissingServerConfigError("Missing server configuration to create runners") - - start_timestamp = time.time() - instance_id = OpenStackRunnerManager._generate_instance_id() - instance_name = self._openstack_cloud.get_server_name(instance_id=instance_id) - cloud_init = self._generate_cloud_init( - instance_name=instance_name, registration_token=registration_token - ) - try: - instance = self._openstack_cloud.launch_instance( - instance_id=instance_id, - image=self._server_config.image, - flavor=self._server_config.flavor, - network=self._server_config.network, - cloud_init=cloud_init, - ) - except OpenStackError as err: - raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err - - self._wait_runner_startup(instance) - self._wait_runner_running(instance) - - end_timestamp = time.time() - OpenStackRunnerManager._issue_runner_installed_metric( - name=instance_name, - flavor=self._manager_name, - install_start_timestamp=start_timestamp, - install_end_timestamp=end_timestamp, - ) - return instance_id - - def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: - """Get a self-hosted runner by instance id. - - Args: - instance_id: The instance id. - - Returns: - Information on the runner instance. - """ - instance = self._openstack_cloud.get_instance(instance_id) - healthy = self._runner_health_check(instance=instance) - return ( - CloudRunnerInstance( - name=instance.server_name, - instance_id=instance_id, - health=HealthState.HEALTHY if healthy else HealthState.UNHEALTHY, - state=CloudRunnerState.from_openstack_server_status(instance.status), - ) - if instance is not None - else None - ) - - def get_runners( - self, states: Sequence[CloudRunnerState] | None = None - ) -> tuple[CloudRunnerInstance, ...]: - """Get self-hosted runners by state. - - Args: - states: Filter for the runners with these github states. If None all states will be - included. - - Returns: - Information on the runner instances. 
- """ - instance_list = self._openstack_cloud.get_instances() - instance_list = [ - CloudRunnerInstance( - name=instance.server_name, - instance_id=instance.instance_id, - health=( - HealthState.HEALTHY - if self._runner_health_check(instance) - else HealthState.UNHEALTHY - ), - state=CloudRunnerState.from_openstack_server_status(instance.status), - ) - for instance in instance_list - ] - if states is None: - return tuple(instance_list) - - state_set = set(states) - return tuple(instance for instance in instance_list if instance.state in state_set) - - def delete_runner( - self, instance_id: InstanceId, remove_token: str - ) -> runner_metrics.RunnerMetrics | None: - """Delete self-hosted runners. - - Args: - instance_id: The instance id of the runner to delete. - remove_token: The GitHub remove token. - - Returns: - Any metrics collected during the deletion of the runner. - """ - instance = self._openstack_cloud.get_instance(instance_id) - if instance is None: - logger.warning( - "Unable to delete instance %s as it is not found", - self._openstack_cloud.get_server_name(instance_id), - ) - return None - - extracted_metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage, runners=set([instance.server_name]) - ) - self._delete_runner(instance, remove_token) - return next(extracted_metrics, None) - - def flush_runners( - self, remove_token: str, busy: bool = False - ) -> Iterator[runner_metrics.RunnerMetrics]: - """Remove idle and/or busy runners. - - Args: - remove_token: - busy: If false, only idle runners are removed. If true, both idle and busy runners are - removed. - - Returns: - Any metrics retrieved from flushed runners. - """ - instance_list = self._openstack_cloud.get_instances() - for instance in instance_list: - try: - self._check_state_and_flush(instance, busy) - except SSHError: - logger.warning( - "Unable to determine state of %s and kill runner process due to SSH issues", - instance.server_name, - ) - continue - return self.cleanup(remove_token) - - def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: - """Cleanup runner and resource on the cloud. - - Args: - remove_token: The GitHub remove token. - - Returns: - Any metrics retrieved from cleanup runners. - """ - runners = self._get_runners_health() - healthy_runner_names = [runner.server_name for runner in runners.healthy] - metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage, runners=set(healthy_runner_names) - ) - for runner in runners.unhealthy: - self._delete_runner(runner, remove_token) - - self._openstack_cloud.cleanup() - return metrics - - def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None: - """Delete self-hosted runners by openstack instance. - - Args: - instance: The OpenStack instance. - remove_token: The GitHub remove token. 
- """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - self._pull_runner_metrics(instance.server_name, ssh_conn) - - try: - OpenStackRunnerManager._run_runner_removal_script( - instance.server_name, ssh_conn, remove_token - ) - except _GithubRunnerRemoveError: - logger.warning( - "Unable to run github runner removal script for %s", - instance.server_name, - stack_info=True, - ) - except SSHError: - logger.exception( - "Failed to get SSH connection while removing %s", instance.server_name - ) - logger.warning( - "Skipping runner remove script for %s due to SSH issues", instance.server_name - ) - - try: - self._openstack_cloud.delete_instance(instance.instance_id) - except OpenStackError: - logger.exception( - "Unable to delete openstack instance for runner %s", instance.server_name - ) - - def _get_runners_health(self) -> _RunnerHealth: - """Get runners by health state. - - Returns: - Runners by health state. - """ - runner_list = self._openstack_cloud.get_instances() - - healthy, unhealthy = [], [] - for runner in runner_list: - if self._runner_health_check(runner): - healthy.append(runner) - else: - unhealthy.append(runner) - return _RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) - - def _runner_health_check(self, instance: OpenstackInstance) -> bool: - """Run health check on a runner. - - Args: - instance: The instance hosting the runner to run health check on. - - Returns: - True if runner is healthy. - """ - cloud_state = CloudRunnerState.from_openstack_server_status(instance.status) - return cloud_state not in set( - ( - CloudRunnerState.DELETED, - CloudRunnerState.ERROR, - CloudRunnerState.STOPPED, - ) - ) and self._health_check(instance) - - def _generate_cloud_init(self, instance_name: str, registration_token: str) -> str: - """Generate cloud init userdata. - - This is the script the openstack server runs on startup. - - Args: - instance_name: The name of the instance. - registration_token: The GitHub runner registration token. - - Returns: - The cloud init userdata for openstack instance. 
- """ - jinja = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) - - env_contents = jinja.get_template("env.j2").render( - pre_job_script=str(PRE_JOB_SCRIPT), - dockerhub_mirror=self._service_config.dockerhub_mirror or "", - ssh_debug_info=( - secrets.choice(self._service_config.ssh_debug_connections) - if self._service_config.ssh_debug_connections - else None - ), - ) - - pre_job_contents_dict = { - "issue_metrics": True, - "metrics_exchange_path": str(METRICS_EXCHANGE_PATH), - "do_repo_policy_check": False, - } - repo_policy = self._get_repo_policy_compliance_client() - if repo_policy is not None: - pre_job_contents_dict.update( - { - "repo_policy_base_url": repo_policy.base_url, - "repo_policy_one_time_token": repo_policy.get_one_time_token(), - "do_repo_policy_check": True, - } - ) - - pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) - - runner_group = None - if isinstance(self._runner_config.github_path, GitHubOrg): - runner_group = self._runner_config.github_path.group - aproxy_address = ( - self._service_config.proxy_config.aproxy_address - if self._service_config.proxy_config is not None - else None - ) - return jinja.get_template("openstack-userdata.sh.j2").render( - github_url=f"https://github.com/{self._runner_config.github_path.path()}", - runner_group=runner_group, - token=registration_token, - instance_labels=",".join(self._runner_config.labels), - instance_name=instance_name, - env_contents=env_contents, - pre_job_contents=pre_job_contents, - metrics_exchange_path=str(METRICS_EXCHANGE_PATH), - aproxy_address=aproxy_address, - dockerhub_mirror=self._service_config.dockerhub_mirror, - ) - - def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | None: - """Get repo policy compliance client. - - Returns: - The repo policy compliance client. - """ - if self._service_config.repo_policy_compliance is not None: - return RepoPolicyComplianceClient( - self._service_config.repo_policy_compliance.url, - self._service_config.repo_policy_compliance.token, - ) - return None - - @retry(tries=3, delay=5, backoff=2, local_logger=logger) - def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> None: - """Kill runner process depending on idle or busy. - - Due to update to runner state has some delay with GitHub API. The state of the runner is - determined by which runner processes are running. If the Runner.Worker process is running, - the runner is deemed to be busy. - - Raises: - SSHError: Unable to check the state of the runner and kill the runner process due to - SSH failure. - - Args: - instance: The openstack instance to kill the runner process. - busy: Kill the process if runner is busy, else only kill runner - process if runner is idle. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except KeyfileError: - logger.exception( - "Health check failed due to unable to find keyfile for %s", instance.server_name - ) - return - except SSHError: - logger.exception( - "SSH connection failure with %s during flushing", instance.server_name - ) - raise - - # Using a single command to determine the state and kill the process if needed. - # This makes it more robust when network is unstable. - if busy: - logger.info("Attempting to kill all runner process on %s", instance.server_name) - # kill both Runner.Listener and Runner.Worker processes. - # This kills pre-job.sh, a child process of Runner.Worker. 
- kill_command = ( - f"pgrep -x {RUNNER_LISTENER_PROCESS} && kill $(pgrep -x {RUNNER_LISTENER_PROCESS});" - f"pgrep -x {RUNNER_WORKER_PROCESS} && kill $(pgrep -x {RUNNER_WORKER_PROCESS});" - ) - else: - logger.info( - "Attempting to kill runner process on %s if not busy", instance.server_name - ) - # Only kill Runner.Listener if Runner.Worker does not exist. - kill_command = ( - f"pgrep -x {RUNNER_WORKER_PROCESS} || pgrep -x {RUNNER_LISTENER_PROCESS} && " - f"kill $(pgrep -x {RUNNER_LISTENER_PROCESS})" - ) - # Checking the result of kill command is not useful, as the exit code does not reveal much. - ssh_conn.run(kill_command, warn=True) - - @retry(tries=3, delay=5, backoff=2, local_logger=logger) - def _health_check(self, instance: OpenstackInstance) -> bool: - """Check whether runner is healthy. - - Args: - instance: The OpenStack instance to conduit the health check. - - Raises: - SSHError: Unable to get a SSH connection to the instance. - - Returns: - Whether the runner is healthy. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except KeyfileError: - logger.exception( - "Health check failed due to unable to find keyfile for %s", instance.server_name - ) - return False - except SSHError: - logger.exception( - "SSH connection failure with %s during health check", instance.server_name - ) - raise - return OpenStackRunnerManager._run_health_check(ssh_conn, instance.server_name) - - @staticmethod - def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: - """Run a health check for runner process. - - Args: - ssh_conn: The SSH connection to the runner. - name: The name of the runner. - - Returns: - Whether the health succeed. - """ - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s: %s", name, result.stderr) - return False - if ( - RUNNER_WORKER_PROCESS not in result.stdout - and RUNNER_LISTENER_PROCESS not in result.stdout - ): - logger.warning("Runner process not found on %s", name) - return False - return True - - @retry(tries=10, delay=60, local_logger=logger) - def _wait_runner_startup(self, instance: OpenstackInstance) -> None: - """Wait until runner is startup. - - Args: - instance: The runner instance. - - Raises: - RunnerStartError: The runner startup process was not found on the runner. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SSHError as err: - raise RunnerStartError( - f"Failed to SSH to {instance.server_name} during creation possible due to setup " - "not completed" - ) from err - - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s", instance.server_name) - raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.server_name}") - if RUNNER_STARTUP_PROCESS not in result.stdout: - logger.warning("Runner startup process not found on %s", instance.server_name) - raise RunnerStartError(f"Runner startup process not found on {instance.server_name}") - logger.info("Runner startup process found to be healthy on %s", instance.server_name) - - @retry(tries=5, delay=60, local_logger=logger) - def _wait_runner_running(self, instance: OpenstackInstance) -> None: - """Wait until runner is running. - - Args: - instance: The runner instance. - - Raises: - RunnerStartError: The runner process was not found on the runner. 
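# The process-based state model behind the checks above, summarised: which
# processes `ps aux` is expected to show (at least) in each phase of a runner's
# life. Phase names here are illustrative; the process names come from the
# module constants.
EXPECTED_PROCESSES = {
    "starting": {RUNNER_STARTUP_PROCESS},  # run.sh launched by cloud-init
    "idle": {RUNNER_LISTENER_PROCESS},  # listener polling for a job
    "busy": {RUNNER_LISTENER_PROCESS, RUNNER_WORKER_PROCESS},  # job executing
}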
- """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SSHError as err: - raise RunnerStartError( - f"Failed to SSH connect to {instance.server_name} openstack runner" - ) from err - - if not self._run_health_check(ssh_conn=ssh_conn, name=instance.server_name): - logger.info("Runner process not found on %s", instance.server_name) - raise RunnerStartError( - f"Runner process on {instance.server_name} failed to initialize on after starting" - ) - - logger.info("Runner process found to be healthy on %s", instance.server_name) - - @staticmethod - def _generate_instance_id() -> InstanceId: - """Generate a instance id. - - Return: - The id. - """ - return secrets.token_hex(12) - - @staticmethod - def _issue_runner_installed_metric( - name: str, - flavor: str, - install_start_timestamp: float, - install_end_timestamp: float, - ) -> None: - """Issue metric for runner installed event. - - Args: - name: The name of the runner. - flavor: The flavor of the runner. - install_start_timestamp: The timestamp of installation start. - install_end_timestamp: The timestamp of installation end. - """ - try: - metric_events.issue_event( - event=metric_events.RunnerInstalled( - timestamp=install_start_timestamp, - flavor=flavor, - duration=install_end_timestamp - install_start_timestamp, - ) - ) - except IssueMetricEventError: - logger.exception("Failed to issue RunnerInstalled metric") - - try: - storage = metrics_storage.create(name) - except CreateMetricsStorageError: - logger.exception( - "Failed to create metrics storage for runner %s, " - "will not be able to issue all metrics.", - name, - ) - else: - try: - (storage.path / runner_metrics.RUNNER_INSTALLED_TS_FILE_NAME).write_text( - str(install_end_timestamp), encoding="utf-8" - ) - except FileNotFoundError: - logger.exception( - "Failed to write runner-installed.timestamp into metrics storage " - "for runner %s, will not be able to issue all metrics.", - name, - ) - - @staticmethod - def _pull_runner_metrics(name: str, ssh_conn: SSHConnection) -> None: - """Pull metrics from runner. - - Args: - name: The name of the runner. - ssh_conn: The SSH connection to the runner. - """ - try: - storage = metrics_storage.get(name) - except GetMetricsStorageError: - logger.exception( - "Failed to get shared metrics storage for runner %s, " - "will not be able to issue all metrics.", - name, - ) - return - - try: - OpenStackRunnerManager._ssh_pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local_path=str(storage.path / "pre-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - OpenStackRunnerManager._ssh_pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local_path=str(storage.path / "post-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - except _PullFileError as exc: - logger.warning( - "Failed to pull metrics for %s: %s . Will not be able to issue all metrics", - name, - exc, - ) - - @staticmethod - def _ssh_pull_file( - ssh_conn: SSHConnection, remote_path: str, local_path: str, max_size: int - ) -> None: - """Pull file from the runner instance. - - Args: - ssh_conn: The SSH connection instance. - remote_path: The file path on the runner instance. - local_path: The local path to store the file. - max_size: If the file is larger than this, it will not be pulled. - - Raises: - _PullFileError: Unable to pull the file from the runner instance. - SSHError: Issue with SSH connection. 
- """ - try: - result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise SSHError(f"Unable to SSH into {ssh_conn.host}") from exc - if not result.ok: - logger.warning( - ( - "Unable to get file size of %s on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - remote_path, - ssh_conn.host, - result.return_code, - result.stdout, - result.stderr, - ) - raise _PullFileError(f"Unable to get file size of {remote_path}") - - stdout = result.stdout - try: - stdout.strip() - size = int(stdout) - if size > max_size: - raise _PullFileError(f"File size of {remote_path} too large {size} > {max_size}") - except ValueError as exc: - raise _PullFileError(f"Invalid file size for {remote_path}: stdout") from exc - - try: - ssh_conn.get(remote=remote_path, local=local_path) - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise SSHError(f"Unable to SSH into {ssh_conn.host}") from exc - except OSError as exc: - raise _PullFileError(f"Unable to retrieve file {remote_path}") from exc - - @staticmethod - def _run_runner_removal_script( - instance_name: str, ssh_conn: SSHConnection, remove_token: str - ) -> None: - """Run Github runner removal script. - - Args: - instance_name: The name of the runner instance. - ssh_conn: The SSH connection to the runner instance. - remove_token: The GitHub instance removal token. - - Raises: - _GithubRunnerRemoveError: Unable to remove runner from GitHub. - """ - try: - result = ssh_conn.run( - f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", - warn=True, - ) - if result.ok: - return - - logger.warning( - ( - "Unable to run removal script on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - instance_name, - result.return_code, - result.stdout, - result.stderr, - ) - raise _GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise _GithubRunnerRemoveError( - f"Failed to remove runner {instance_name} from Github." - ) from exc diff --git a/src/reactive/__init__.py b/src/reactive/__init__.py deleted file mode 100644 index 1c7b82dda..000000000 --- a/src/reactive/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Package for code implementing reactive scheduling.""" diff --git a/src/reactive/consumer.py b/src/reactive/consumer.py deleted file mode 100644 index f868feddd..000000000 --- a/src/reactive/consumer.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Module responsible for consuming jobs from the message queue.""" -import contextlib -import logging -import signal -import sys -from contextlib import closing -from types import FrameType -from typing import Generator, cast - -from kombu import Connection -from kombu.simple import SimpleQueue -from pydantic import BaseModel, HttpUrl, ValidationError - -logger = logging.getLogger(__name__) - - -class JobDetails(BaseModel): - """A class to translate the payload. - - Attributes: - labels: The labels of the job. - run_url: The URL of the job. 
- """ - - labels: list[str] - run_url: HttpUrl - - -class JobError(Exception): - """Raised when a job error occurs.""" - - -def consume(mongodb_uri: str, queue_name: str) -> None: - """Consume a job from the message queue. - - Log the job details and acknowledge the message. - If the job details are invalid, reject the message and raise an error. - - Args: - mongodb_uri: The URI of the MongoDB database. - queue_name: The name of the queue. - - Raises: - JobError: If the job details are invalid. - """ - with Connection(mongodb_uri) as conn: - with closing(SimpleQueue(conn, queue_name)) as simple_queue: - with signal_handler(signal.SIGTERM): - msg = simple_queue.get(block=True) - try: - job_details = cast(JobDetails, JobDetails.parse_raw(msg.payload)) - except ValidationError as exc: - msg.reject(requeue=True) - raise JobError(f"Invalid job details: {msg.payload}") from exc - logger.info( - "Received job with labels %s and run_url %s", - job_details.labels, - job_details.run_url, - ) - msg.ack() - - -@contextlib.contextmanager -def signal_handler(signal_code: signal.Signals) -> Generator[None, None, None]: - """Set a signal handler and after the context, restore the default handler. - - The signal handler exits the process. - - Args: - signal_code: The signal code to handle. - """ - _set_signal_handler(signal_code) - try: - yield - finally: - _restore_signal_handler(signal_code) - - -def _set_signal_handler(signal_code: signal.Signals) -> None: - """Set a signal handler which exits the process. - - Args: - signal_code: The signal code to handle. - """ - - def sigterm_handler(signal_code: int, _: FrameType | None) -> None: - """Handle a signal. - - Call sys.exit with the signal code. Kombu should automatically - requeue unacknowledged messages. - - Args: - signal_code: The signal code to handle. - """ - print( - f"Signal '{signal.strsignal(signal_code)}' received. Will terminate.", file=sys.stderr - ) - sys.exit(signal_code) - - signal.signal(signal_code, sigterm_handler) - - -def _restore_signal_handler(signal_code: signal.Signals) -> None: - """Restore the default signal handler. - - Args: - signal_code: The signal code to restore. - """ - signal.signal(signal_code, signal.SIG_DFL) diff --git a/src/reactive/runner_manager.py b/src/reactive/runner_manager.py deleted file mode 100644 index 5799731ee..000000000 --- a/src/reactive/runner_manager.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Module for managing reactive runners.""" -import logging -import os -import shutil -import signal - -# All commands run by subprocess are secure. -import subprocess # nosec -from pathlib import Path - -from utilities import secure_run_subprocess - -logger = logging.getLogger(__name__) - -MQ_URI_ENV_VAR = "MQ_URI" -QUEUE_NAME_ENV_VAR = "QUEUE_NAME" -REACTIVE_RUNNER_LOG_DIR = Path("/var/log/reactive_runner") -REACTIVE_RUNNER_SCRIPT_FILE = "scripts/reactive_runner.py" -PYTHON_BIN = "/usr/bin/python3" -REACTIVE_RUNNER_CMD_LINE_PREFIX = f"{PYTHON_BIN} {REACTIVE_RUNNER_SCRIPT_FILE}" -PID_CMD_COLUMN_WIDTH = len(REACTIVE_RUNNER_CMD_LINE_PREFIX) -PIDS_COMMAND_LINE = [ - "ps", - "axo", - f"cmd:{PID_CMD_COLUMN_WIDTH},pid", - "--no-headers", - "--sort=-start_time", -] -UBUNTU_USER = "ubuntu" - - -class ReactiveRunnerError(Exception): - """Raised when a reactive runner error occurs.""" - - -def reconcile(quantity: int, mq_uri: str, queue_name: str) -> int: - """Spawn a runner reactively. - - Args: - quantity: The number of runners to spawn. 
- mq_uri: The message queue URI.
- queue_name: The name of the queue.
-
- Raises:
- ReactiveRunnerError: If the runner fails to spawn.
-
- Returns:
- The number of reactive runner processes spawned.
- """
- pids = _get_pids()
- current_quantity = len(pids)
- logger.info("Current quantity of reactive runner processes: %s", current_quantity)
- delta = quantity - current_quantity
- if delta > 0:
- logger.info("Will spawn %d new reactive runner process(es)", delta)
- _setup_logging_for_processes()
- for _ in range(delta):
- _spawn_runner(mq_uri=mq_uri, queue_name=queue_name)
- elif delta < 0:
- logger.info("Will kill %d process(es).", -delta)
- for pid in pids[:-delta]:
- logger.info("Killing reactive runner process with pid %s", pid)
- try:
- os.kill(pid, signal.SIGTERM)
- except ProcessLookupError:
- # There can be a race condition that the process has already terminated.
- # We just ignore and log the fact.
- logger.info(
- "Failed to kill process with pid %s. Process might have terminated itself.",
- pid,
- )
- else:
- logger.info("No changes to number of reactive runner processes needed.")
-
- return delta
-
-
-def _get_pids() -> list[int]:
- """Get the PIDs of the reactive runner processes.
-
- Returns:
- The PIDs of the reactive runner processes sorted by start time in descending order.
-
- Raises:
- ReactiveRunnerError: If the command to get the PIDs fails.
- """
- result = secure_run_subprocess(cmd=PIDS_COMMAND_LINE)
- if result.returncode != 0:
- raise ReactiveRunnerError("Failed to get list of processes")
-
- return [
- int(line.rstrip().rsplit(maxsplit=1)[-1])
- for line in result.stdout.decode().split("\n")
- if line.startswith(REACTIVE_RUNNER_CMD_LINE_PREFIX)
- ]
-
-
-def _setup_logging_for_processes() -> None:
- """Set up the log dir."""
- if not REACTIVE_RUNNER_LOG_DIR.exists():
- REACTIVE_RUNNER_LOG_DIR.mkdir()
- shutil.chown(REACTIVE_RUNNER_LOG_DIR, user=UBUNTU_USER, group=UBUNTU_USER)
-
-
-def _spawn_runner(mq_uri: str, queue_name: str) -> None:
- """Spawn a runner.
-
- Args:
- mq_uri: The message queue URI.
- queue_name: The name of the queue.
- """
- env = {
- "PYTHONPATH": "src:lib:venv",
- MQ_URI_ENV_VAR: mq_uri,
- QUEUE_NAME_ENV_VAR: queue_name,
- }
- # We do not want to wait for the process to finish, so we do not use a with statement.
- # We trust the command.
- command = " ".join(
- [
- PYTHON_BIN,
- REACTIVE_RUNNER_SCRIPT_FILE,
- ">>",
- # $$ will be replaced by the PID of the process, so we can track the error log easily.
- f"{REACTIVE_RUNNER_LOG_DIR}/$$.log",
- "2>&1",
- ]
- )
- logger.debug("Spawning a new reactive runner process with command: %s", command)
- process = subprocess.Popen( # pylint: disable=consider-using-with # nosec
- command,
- shell=True,
- env=env,
- stdout=subprocess.DEVNULL,
- stderr=subprocess.DEVNULL,
- user=UBUNTU_USER,
- )
-
- logger.info("Spawned a new reactive runner process with pid %s", process.pid)
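
The reconcile logic above is the core of the reactive scheduler: compare the requested quantity against the number of live runner processes, then spawn or SIGTERM the difference. A minimal sketch of that bookkeeping, with invented PIDs in place of real processes (the helper name `plan` is not from the module):

# Illustrative sketch of reconcile's spawn/kill arithmetic; plan is a
# hypothetical helper and the PIDs are invented.
def plan(quantity: int, pids: list[int]) -> tuple[int, list[int]]:
    """Return (processes to spawn, PIDs to kill) for a target quantity."""
    delta = quantity - len(pids)
    if delta >= 0:
        return delta, []
    # PIDs arrive sorted by start time in descending order, so pids[:-delta]
    # selects the most recently started processes, mirroring reconcile.
    return 0, pids[:-delta]

print(plan(3, [201, 176]))       # (1, []): spawn one more process
print(plan(1, [201, 176, 150]))  # (0, [201, 176]): kill the two newest
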
diff --git a/src/repo_policy_compliance_client.py b/src/repo_policy_compliance_client.py
deleted file mode 100644
index 6dbc1d919..000000000
--- a/src/repo_policy_compliance_client.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright 2024 Canonical Ltd.
-# See LICENSE file for licensing details.
-
-"""Client for requesting repo policy compliance service."""
-
-import logging
-from urllib.parse import urljoin
-
-import requests
-import urllib3
-
-logger = logging.getLogger(__name__)
-
-
-# Disable pylint public method number check as this class can be extended in the future.
-class RepoPolicyComplianceClient: # pylint: disable=too-few-public-methods
- """Client for repo policy compliance service.
-
- Attributes:
- base_url: Base URL to the repo policy compliance service.
- token: Charm token configured for the repo policy compliance service.
- """
-
- def __init__(self, url: str, charm_token: str) -> None:
- """Construct the RepoPolicyComplianceClient.
-
- Args:
- url: Base URL to the repo policy compliance service.
- charm_token: Charm token configured for the repo policy compliance service.
- """
- self._session = self._create_session()
- self.base_url = url
- self.token = charm_token
-
- def get_one_time_token(self) -> str:
- """Get a single-use token for repo policy compliance check.
-
- Raises:
- HTTPError: If there was an error getting one-time token from repo-policy-compliance \
- service.
-
- Returns:
- The one-time token to be used in a single request of repo policy compliance check.
- """
- url = urljoin(self.base_url, "one-time-token")
- try:
- response = self._session.get(url, headers={"Authorization": f"Bearer {self.token}"})
- response.raise_for_status()
- return response.content.decode("utf-8")
- except requests.HTTPError:
- logger.exception("Unable to get one-time token from repo policy compliance service.")
- raise
-
- def _create_session(self) -> requests.Session:
- """Create a new requests session.
-
- Returns:
- A new requests session with retries and no proxy settings.
- """
- # The repo policy compliance service might be on localhost and should not have any proxies
- # setting configured. This can be changed in the future when we also rely on an
- # external service for LXD cloud.
- adapter = requests.adapters.HTTPAdapter(
- max_retries=urllib3.Retry(
- total=3, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504]
- )
- )
-
- session = requests.Session()
- session.mount("http://", adapter)
- session.mount("https://", adapter)
- session.trust_env = False
- return session
diff --git a/src/runner.py b/src/runner.py
index 61a12115c..a1b64fcfc 100644
--- a/src/runner.py
+++ b/src/runner.py
@@ -21,9 +21,12 @@
 from typing import Iterable, NamedTuple, Optional, Sequence
 
 import yaml
+from github_runner_manager.metrics.runner_logs import SYSLOG_PATH, create_logs_dir
+from github_runner_manager.metrics.storage import MetricsStorage
+from github_runner_manager.types_.github import GitHubOrg
 
 import shared_fs
-from charm_state import Arch, GitHubOrg, SSHDebugConnection, VirtualMachineResources
+from charm_state import Arch, SSHDebugConnection, VirtualMachineResources
 from errors import (
 CreateMetricsStorageError,
 GithubClientError,
@@ -38,8 +41,6 @@
 )
 from lxd import LxdInstance
 from lxd_type import LxdInstanceConfig
-from metrics.runner_logs import SYSLOG_PATH, create_logs_dir
-from metrics.storage import MetricsStorage
 from runner_manager_type import RunnerManagerClients
 from runner_type import RunnerConfig, RunnerStatus
 from utilities import execute_command, retry
diff --git a/src/runner_manager.py b/src/runner_manager.py
index 31c30ef85..66a7e03d3 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -13,12 +13,19 @@
 from pathlib import Path
 from typing import Iterator, Optional, Type
 
+import github_runner_manager.reactive.runner_manager as reactive_runner_manager
 import jinja2
 import requests
 import requests.adapters
 import urllib3
+from github_runner_manager.metrics import events as metric_events
+from github_runner_manager.metrics import github as github_metrics
+from github_runner_manager.metrics import runner as runner_metrics
+from github_runner_manager.metrics import runner_logs
+from github_runner_manager.metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME
+from github_runner_manager.repo_policy_compliance_client import RepoPolicyComplianceClient
+from github_runner_manager.types_.github import RunnerApplication, SelfHostedRunner
 
-import reactive.runner_manager as reactive_runner_manager
 import shared_fs
 from charm_state import VirtualMachineResources
 from errors import (
@@ -32,14 +39,7 @@
 SubprocessError,
 )
 from github_client import GithubClient
-from github_type import RunnerApplication, SelfHostedRunner
 from lxd import LxdClient, LxdInstance
-from metrics import events as metric_events
-from metrics import github as github_metrics
-from metrics import runner as runner_metrics
-from metrics import runner_logs
-from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME
-from repo_policy_compliance_client import RepoPolicyComplianceClient
 from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus
 from runner_manager_type import (
 LXDFlushMode,
diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py
index 95f8edcc3..deb30540b 100644
--- a/src/runner_manager_type.py
+++ b/src/runner_manager_type.py
@@ -9,12 +9,12 @@
 from typing import Iterable
 
 import jinja2
+from github_runner_manager.repo_policy_compliance_client import RepoPolicyComplianceClient
+from github_runner_manager.types_.github import GitHubPath, GitHubRunnerStatus
 
-from charm_state import CharmState, GitHubPath, ReactiveConfig
+from charm_state import CharmState, ReactiveConfig
 from github_client import GithubClient
-from github_type import GitHubRunnerStatus
 from lxd import LxdClient
-from repo_policy_compliance_client import RepoPolicyComplianceClient
 
 
 class LXDFlushMode(Enum):
diff --git a/src/runner_type.py b/src/runner_type.py
index 92560cbcf..eec8793ee 100644
--- a/src/runner_type.py
+++ b/src/runner_type.py
@@ -8,7 +8,9 @@
 from pathlib import Path
 from typing import Optional
 
-from charm_state import GitHubPath, SSHDebugConnection
+from github_runner_manager.types_.github import GitHubPath
+
+from charm_state import SSHDebugConnection
 
 
 @dataclass
diff --git a/src/shared_fs.py b/src/shared_fs.py
index 28e97c4fb..48c392113 100644
--- a/src/shared_fs.py
+++ b/src/shared_fs.py
@@ -7,7 +7,8 @@
 from pathlib import Path
 from typing import Iterator
 
-import metrics.storage as metrics_storage
+import github_runner_manager.metrics.storage as metrics_storage
+
 from errors import (
 CreateMetricsStorageError,
 DeleteMetricsStorageError,
diff --git a/src/utilities.py b/src/utilities.py
index a19effc5c..86c32c4d2 100644
--- a/src/utilities.py
+++ b/src/utilities.py
@@ -3,14 +3,18 @@
 
 """Utilities used by the charm."""
 
-import functools
 import logging
 import os
 import pathlib
 import subprocess # nosec B404
-import time
-from typing import Any, Callable, Optional, Sequence, Type, TypeVar
-
+from typing import Any, Optional, Sequence, TypeVar
+
+# We import these functions from the github_runner_manager package; they are used by the charm.
+from github_runner_manager.utilities import retry # noqa: F401 pylint: disable=unused-import
+from github_runner_manager.utilities import ( # noqa: F401 pylint: disable=unused-import
+ secure_run_subprocess,
+ set_env_var,
+)
 from typing_extensions import ParamSpec
 
 from errors import SubprocessError
@@ -24,130 +28,6 @@
 ReturnT = TypeVar("ReturnT")
 
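Call sites keep the retry decorator's shape even though it now lives in github_runner_manager.utilities, as the new imports above show. A minimal usage sketch, assuming that package is installed; the flaky function and its counter are invented:

# Hypothetical use of the retry decorator: three attempts, 0.1s initial
# delay, exponential backoff. Only ValueError triggers a retry here.
from github_runner_manager.utilities import retry

attempts = 0

@retry(exception=ValueError, tries=3, delay=0.1, backoff=2)
def flaky() -> str:
    global attempts
    attempts += 1
    if attempts < 3:
        raise ValueError("not ready yet")
    return "ok"

print(flaky())  # raises twice internally, succeeds on the third attempt
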
-# This decorator has default arguments, one extra argument is not a problem.
-def retry( # pylint: disable=too-many-arguments
- exception: Type[Exception] = Exception,
- tries: int = 1,
- delay: float = 0,
- max_delay: Optional[float] = None,
- backoff: float = 1,
- local_logger: logging.Logger = logger,
-) -> Callable[[Callable[ParamT, ReturnT]], Callable[ParamT, ReturnT]]:
- """Parameterize the decorator for adding retry to functions.
-
- Args:
- exception: Exception type to be retried.
- tries: Number of attempts at retry.
- delay: Time in seconds to wait between retries.
- max_delay: Max time in seconds to wait between retries.
- backoff: Factor to increase the delay by each retry.
- local_logger: Logger for logging.
-
- Returns:
- The function decorator for retry.
- """
-
- def retry_decorator(
- func: Callable[ParamT, ReturnT],
- ) -> Callable[ParamT, ReturnT]:
- """Decorate function with retry.
-
- Args:
- func: The function to decorate.
-
- Returns:
- The resulting function with retry added.
- """
-
- @functools.wraps(func)
- def fn_with_retry(*args: ParamT.args, **kwargs: ParamT.kwargs) -> ReturnT:
- """Wrap the function with retries.
-
- Args:
- args: The placeholder for decorated function's positional arguments.
- kwargs: The placeholder for decorated function's keyword arguments.
-
- Raises:
- RuntimeError: Should be unreachable.
-
- Returns:
- Original return type of the decorated function.
- """
- remain_tries, current_delay = tries, delay
-
- for _ in range(tries):
- try:
- return func(*args, **kwargs)
- # Error caught is set by the input of the function.
- except exception as err: # pylint: disable=broad-exception-caught
- remain_tries -= 1
-
- if remain_tries == 0:
- if local_logger is not None:
- local_logger.exception("Retry limit of %s exceeded: %s", tries, err)
- raise
-
- if local_logger is not None:
- local_logger.warning(
- "Retrying error in %s seconds: %s", current_delay, err
- )
- local_logger.debug("Error to be retried:", stack_info=True)
-
- time.sleep(current_delay)
-
- current_delay *= backoff
-
- if max_delay is not None:
- current_delay = min(current_delay, max_delay)
-
- raise RuntimeError("Unreachable code of retry logic.")
-
- return fn_with_retry
-
- return retry_decorator
-
-
-def secure_run_subprocess(
- cmd: Sequence[str], hide_cmd: bool = False, **kwargs: dict[str, Any]
-) -> subprocess.CompletedProcess[bytes]:
- """Run command in subprocess according to security recommendations.
-
- CalledProcessError will not be raised on error of the command executed.
- Errors should be handled by the caller by checking the exit code.
-
- The command is executed with `subprocess.run`, additional arguments can be passed to it as
- keyword arguments. The following arguments to `subprocess.run` should not be set:
- `capture_output`, `shell`, `check`, as those arguments are used by this function.
-
- Args:
- cmd: Command in a list.
- hide_cmd: Hide logging of cmd.
- kwargs: Additional keyword arguments for the `subprocess.run` call.
-
- Returns:
- Object representing the completed process. The outputs of the subprocess can be accessed.
- """
- if not hide_cmd:
- logger.info("Executing command %s", cmd)
- else:
- logger.info("Executing sensitive command")
-
- result = subprocess.run( # nosec B603
- cmd,
- capture_output=True,
- # Not running in shell to avoid security problems.
- shell=False,
- check=False,
- # Disable type check because support for unpacking arguments in mypy is experimental.
- **kwargs, # type: ignore - ) - if not hide_cmd: - logger.debug("Command %s returns: %s", cmd, result.stdout) - else: - logger.debug("Command returns: %s", result.stdout) - return result - - def execute_command(cmd: Sequence[str], check_exit: bool = True, **kwargs: Any) -> tuple[str, int]: """Execute a command on a subprocess. @@ -203,19 +83,6 @@ def get_env_var(env_var: str) -> Optional[str]: return os.environ.get(env_var.upper(), os.environ.get(env_var.lower(), None)) -def set_env_var(env_var: str, value: str) -> None: - """Set the environment variable value. - - Set the all upper case and all low case of the `env_var`. - - Args: - env_var: Name of the environment variable. - value: Value to set environment variable to. - """ - os.environ[env_var.upper()] = value - os.environ[env_var.lower()] = value - - def bytes_with_unit_to_kib(num_bytes: str) -> int: """Convert a positive integer followed by a unit to number of kibibytes. diff --git a/templates/openstack-userdata.sh.j2 b/templates/openstack-userdata.sh.j2 deleted file mode 100644 index 047a62be1..000000000 --- a/templates/openstack-userdata.sh.j2 +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/sh - -set -e - -hostnamectl set-hostname github-runner - -# Write .env contents -su - ubuntu -c 'cd ~/actions-runner && echo "{{ env_contents }}" > .env' - -{% if aproxy_address %} -snap install aproxy --edge -snap set aproxy proxy={{ aproxy_address }} listen=:54969 -cat << EOF > /etc/nftables.conf -define default-ip = $(ip route get $(ip route show 0.0.0.0/0 | grep -oP 'via \K\S+') | grep -oP 'src \K\S+') -define private-ips = { 10.0.0.0/8, 127.0.0.1/8, 172.16.0.0/12, 192.168.0.0/16 } -table ip aproxy -flush table ip aproxy -table ip aproxy { - chain prerouting { - type nat hook prerouting priority dstnat; policy accept; - ip daddr != \$private-ips tcp dport { 80, 443 } counter dnat to \$default-ip:54969 - } - - chain output { - type nat hook output priority -100; policy accept; - ip daddr != \$private-ips tcp dport { 80, 443 } counter dnat to \$default-ip:54969 - } -} -EOF -systemctl enable nftables.service -nft -f /etc/nftables.conf -{% endif %} - -adduser ubuntu lxd -adduser ubuntu adm - -{% if dockerhub_mirror %} -echo "{\"registry-mirrors\": [\"{{ dockerhub_mirror }}\"]}" > /etc/docker/daemon.json -sudo systemctl daemon-reload -sudo systemctl restart docker -{% endif %} - -# Prepare metrics -su - ubuntu -c 'mkdir "{{ metrics_exchange_path }}"' - -# Insert pre-job script -cat << 'EOF' | su - ubuntu -c 'tee /home/ubuntu/actions-runner/pre-job.sh' -{{ pre_job_contents | safe }} -EOF - -# Create the runner and start the configuration experience -{% if runner_group %} -su - ubuntu -c "cd ~/actions-runner && ./config.sh \ - --url {{ github_url }} \ - --runnergroup '{{ runner_group }}' \ - --token {{ token }} --ephemeral --unattended \ - --labels {{ instance_labels }} --name {{ instance_name }}" -{% else %} -su - ubuntu -c "cd ~/actions-runner && ./config.sh \ - --url {{ github_url }} \ - --token {{ token }} --ephemeral --unattended \ - --labels {{ instance_labels }} --name {{ instance_name }}" -{% endif %} - - -write_post_metrics(){ - # Expects the exit code of the run.sh script as the first argument. - - # Only write the post-job metrics if the file does not already exist - which may indicate - # that the job has failed inside pre-job. 
- - if [ -f {{ metrics_exchange_path}}/post-job-metrics.json ]; then - return - fi - - timestamp=$(date +%s) - - # Write the post-job metrics using status abnormal and exit code if exit code is non-zero - if [ "$1" != "0" ]; then - sudo -g ubuntu -u ubuntu jq -n \ - --argjson timestamp "$timestamp" \ - --arg status "abnormal" \ - --argjson exit_code "$1" \ - '{ - "timestamp": $timestamp, - "status": $status, - "status_info": {code: $exit_code} - }' > "{{ metrics_exchange_path}}/post-job-metrics.json" - return - else - # If exit code is zero, write the post-job metrics using status normal - sudo -g ubuntu -u ubuntu jq -n \ - --argjson timestamp "$timestamp" \ - '{ - "timestamp": $timestamp, - "status": "normal" - }' > "{{ metrics_exchange_path }}/post-job-metrics.json" - fi -} - -# Run runner -# We want to capture the exit code of the run.sh script and write the post-job metrics. -(set +e; su - ubuntu -c "cd ~/actions-runner && /home/ubuntu/actions-runner/run.sh"; write_post_metrics $?) - -su - ubuntu -c "touch /home/ubuntu/run-completed" diff --git a/tests/integration/helpers/charm_metrics.py b/tests/integration/helpers/charm_metrics.py index b6c2f05bc..15cd7e3db 100644 --- a/tests/integration/helpers/charm_metrics.py +++ b/tests/integration/helpers/charm_metrics.py @@ -14,12 +14,12 @@ from github.Repository import Repository from github.Workflow import Workflow from github.WorkflowJob import WorkflowJob +from github_runner_manager.metrics.events import METRICS_LOG_PATH +from github_runner_manager.metrics.runner import PostJobStatus +from github_runner_manager.types_.github import JobConclusion from juju.application import Application from juju.unit import Unit -from github_type import JobConclusion -from metrics.events import METRICS_LOG_PATH -from metrics.runner import PostJobStatus from tests.integration.helpers.common import ( InstanceHelper, get_file_content, diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index c15afd5a5..a77c14604 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -6,12 +6,12 @@ from typing import Optional, TypedDict, cast import openstack.connection +from github_runner_manager.openstack_cloud.openstack_cloud import OpenstackCloud from juju.application import Application from juju.unit import Unit from openstack.compute.v2.server import Server from charm_state import VIRTUAL_MACHINES_CONFIG_NAME -from openstack_cloud.openstack_cloud import OpenstackCloud from tests.integration.helpers.common import InstanceHelper, reconcile, run_in_unit, wait_for logger = logging.getLogger(__name__) diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index e3de1600d..6ce23fa0d 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -10,12 +10,12 @@ import pytest_asyncio from github.Branch import Branch from github.Repository import Repository +from github_runner_manager.metrics import runner_logs +from github_runner_manager.metrics.runner import PostJobStatus from juju.application import Application from juju.model import Model from charm_state import PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME -from metrics import runner_logs -from metrics.runner import PostJobStatus from tests.integration.helpers.charm_metrics import ( assert_events_after_reconciliation, cancel_workflow_run, diff --git a/tests/integration/test_charm_metrics_success.py 
b/tests/integration/test_charm_metrics_success.py index c9b7a8dc0..5e8254e5d 100644 --- a/tests/integration/test_charm_metrics_success.py +++ b/tests/integration/test_charm_metrics_success.py @@ -10,11 +10,11 @@ import pytest_asyncio from github.Branch import Branch from github.Repository import Repository +from github_runner_manager.metrics.runner import PostJobStatus from juju.application import Application from juju.model import Model from charm_state import PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME -from metrics.runner import PostJobStatus from tests.integration.helpers.charm_metrics import ( assert_events_after_reconciliation, clear_metrics_log, diff --git a/tests/integration/test_reactive.py b/tests/integration/test_reactive.py index b7445be1f..06dc6e48c 100644 --- a/tests/integration/test_reactive.py +++ b/tests/integration/test_reactive.py @@ -6,14 +6,14 @@ import secrets import pytest +from github_runner_manager.reactive.consumer import JobDetails +from github_runner_manager.reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from juju.application import Application from juju.model import Model from juju.unit import Unit from kombu import Connection from pytest_operator.plugin import OpsTest -from reactive.consumer import JobDetails -from reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from tests.integration.helpers.common import get_file_content, reconcile, run_in_unit FAKE_URL = "http://example.com" diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 63b7204b3..cb88d84ba 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -1,5 +1,5 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
"""Testing the RunnerManager class with OpenStackRunnerManager as CloudManager.""" @@ -15,19 +15,28 @@ from github.Branch import Branch from github.Repository import Repository from github.Workflow import Workflow -from openstack.connection import Connection as OpenstackConnection - -from charm_state import GitHubPath, ProxyConfig, parse_github_path -from manager.cloud_runner_manager import CloudRunnerState, GitHubRunnerConfig, SupportServiceConfig -from manager.github_runner_manager import GitHubRunnerState -from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig -from metrics import events, storage -from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH -from openstack_cloud.openstack_runner_manager import ( +from github_runner_manager.manager.cloud_runner_manager import ( + CloudRunnerState, + GitHubRunnerConfig, + SupportServiceConfig, +) +from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.manager.runner_manager import ( + FlushMode, + RunnerManager, + RunnerManagerConfig, +) +from github_runner_manager.metrics import events, storage +from github_runner_manager.openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH +from github_runner_manager.openstack_cloud.openstack_runner_manager import ( OpenStackCloudConfig, OpenStackRunnerManager, OpenStackServerConfig, ) +from github_runner_manager.types_.github import GitHubPath, parse_github_path +from openstack.connection import Connection as OpenstackConnection + +from charm_state import ProxyConfig from tests.integration.helpers.common import ( DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, dispatch_workflow, diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py index 4232fae4b..46c8280b1 100644 --- a/tests/integration/test_self_hosted_runner.py +++ b/tests/integration/test_self_hosted_runner.py @@ -9,6 +9,7 @@ import github import pytest from github.Repository import Repository +from github_runner_manager.types_.github import GitHubRepo from juju.application import Application from juju.model import Model @@ -16,7 +17,6 @@ DOCKERHUB_MIRROR_CONFIG_NAME, PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME, - GitHubRepo, ) from github_client import GithubClient from tests.integration.helpers.common import ( diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index cb50275f6..c0b760144 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -8,9 +8,9 @@ from pathlib import Path import pytest +from github_runner_manager.manager.runner_scaler import RunnerScaler import utilities -from manager.runner_scaler import RunnerScaler from tests.unit.mock import MockGhapiClient, MockLxdClient, MockRepoPolicyComplianceClient @@ -76,9 +76,11 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr("firewall.Firewall.refresh_firewall", unittest.mock.MagicMock()) monkeypatch.setattr("runner.execute_command", lxd_exec_command) monkeypatch.setattr("runner.shared_fs", unittest.mock.MagicMock()) - monkeypatch.setattr("metrics.events.METRICS_LOG_PATH", Path(tmp_path / "metrics.log")) + monkeypatch.setattr( + "github_runner_manager.metrics.events.METRICS_LOG_PATH", Path(tmp_path / "metrics.log") + ) monkeypatch.setattr("runner.time", unittest.mock.MagicMock()) - monkeypatch.setattr("github_client.GhApi", MockGhapiClient) + monkeypatch.setattr("github_runner_manager.github_client.GhApi", MockGhapiClient) monkeypatch.setattr("runner_manager_type.jinja2", 
unittest.mock.MagicMock()) monkeypatch.setattr("runner_manager_type.LxdClient", MockLxdClient) monkeypatch.setattr("runner_manager.github_metrics", unittest.mock.MagicMock()) @@ -91,7 +93,7 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr( "runner_manager.RepoPolicyComplianceClient", MockRepoPolicyComplianceClient ) - monkeypatch.setattr("utilities.time", unittest.mock.MagicMock()) + monkeypatch.setattr("github_runner_manager.utilities.time", unittest.mock.MagicMock()) @pytest.fixture(autouse=True, name="cloud_name") @@ -108,7 +110,7 @@ def clouds_yaml_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path: Path: Mocked clouds.yaml path. """ clouds_yaml_path = tmp_path / "clouds.yaml" - monkeypatch.setattr("openstack_cloud.CLOUDS_YAML_PATH", clouds_yaml_path) + monkeypatch.setattr("github_runner_manager.openstack_cloud.CLOUDS_YAML_PATH", clouds_yaml_path) return clouds_yaml_path diff --git a/tests/unit/metrics/__init__.py b/tests/unit/metrics/__init__.py deleted file mode 100644 index 188515554..000000000 --- a/tests/unit/metrics/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. diff --git a/tests/unit/metrics/test_events.py b/tests/unit/metrics/test_events.py deleted file mode 100644 index 195768291..000000000 --- a/tests/unit/metrics/test_events.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import json -from pathlib import Path - -from metrics import events - -TEST_LOKI_PUSH_API_URL = "http://loki:3100/api/prom/push" - - -def test_issue_events_logs_events(tmp_path: Path): - """ - arrange: Change path of the events log. - act: Issue a metric event. - assert: The expected metric log is created. - """ - event = events.RunnerInstalled(timestamp=123, flavor="small", duration=456) - - events.issue_event(event) - - assert json.loads(events.METRICS_LOG_PATH.read_text()) == { - "event": "runner_installed", - "timestamp": 123, - "flavor": "small", - "duration": 456, - } - - -def test_issue_events_exclude_none_values(tmp_path: Path): - """ - arrange: Change path of the events log. - act: Issue a metric event with a None value. - assert: The expected metric log without the None value is created. - """ - event = events.RunnerStop( - timestamp=123, - flavor="small", - workflow="workflow", - repo="repo", - github_event="github_event", - status="status", - status_info=None, - job_duration=456, - ) - - events.issue_event(event) - - assert json.loads(events.METRICS_LOG_PATH.read_text()) == { - "event": "runner_stop", - "timestamp": 123, - "flavor": "small", - "workflow": "workflow", - "repo": "repo", - "github_event": "github_event", - "status": "status", - "job_duration": 456, - } diff --git a/tests/unit/metrics/test_github.py b/tests/unit/metrics/test_github.py deleted file mode 100644 index 78a21e4e1..000000000 --- a/tests/unit/metrics/test_github.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
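
The two test_events cases above pin down the on-disk contract of the metrics log: each issued event becomes one JSON object, and None-valued fields are dropped before serialization. A standalone sketch of that rule; the event dict here is invented:

# One JSON object per event, None fields omitted, as the tests assert.
import json

event = {"event": "runner_stop", "timestamp": 123, "status_info": None}
line = json.dumps({key: value for key, value in event.items() if value is not None})
print(line)  # {"event": "runner_stop", "timestamp": 123}
assert "status_info" not in json.loads(line)
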
-import secrets -from datetime import datetime, timedelta, timezone -from random import randint -from unittest.mock import MagicMock - -import pytest - -from errors import GithubMetricsError, JobNotFoundError -from github_client import GithubClient -from github_type import JobConclusion, JobStats -from metrics import github as github_metrics -from metrics.runner import PreJobMetrics - - -@pytest.fixture(name="pre_job_metrics") -def pre_job_metrics_fixture() -> PreJobMetrics: - """Create a PreJobMetrics object.""" - return PreJobMetrics( - repository="owner/repo", - workflow_run_id=1, - workflow="workflow", - job_name="job", - job_started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - timestamp=1234567890, - event="push", - ) - - -def test_job(pre_job_metrics: PreJobMetrics): - """ - arrange: create a GithubClient mock which returns a GithubJobStats object. - act: Call job. - assert: the job metrics are returned. - """ - github_client = MagicMock(spec=GithubClient) - runner_name = secrets.token_hex(16) - created_at = datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc) - started_at = created_at + timedelta(seconds=3600) - github_client.get_job_info.return_value = JobStats( - created_at=created_at, - started_at=started_at, - runner_name=runner_name, - conclusion=JobConclusion.SUCCESS, - job_id=randint(1, 1000), - ) - - job_metrics = github_metrics.job( - github_client=github_client, pre_job_metrics=pre_job_metrics, runner_name=runner_name - ) - - assert job_metrics.queue_duration == 3600 - assert job_metrics.conclusion == JobConclusion.SUCCESS - - -def test_job_job_not_found(pre_job_metrics: PreJobMetrics): - """ - arrange: create a GithubClient mock which raises a JobNotFound exception. - act: Call job. - assert: a GithubMetricsError is raised. - """ - github_client = MagicMock(spec=GithubClient) - runner_name = secrets.token_hex(16) - github_client.get_job_info.side_effect = JobNotFoundError("Job not found") - - with pytest.raises(GithubMetricsError): - github_metrics.job( - github_client=github_client, pre_job_metrics=pre_job_metrics, runner_name=runner_name - ) diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py deleted file mode 100644 index bf0a14251..000000000 --- a/tests/unit/metrics/test_runner.py +++ /dev/null @@ -1,649 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
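
test_job above fixes the definition of queue duration in the GitHub metrics: the gap between a job's created_at and started_at timestamps, in seconds. A self-contained sketch of that arithmetic with invented timestamps:

# queue_duration as exercised by test_job: started_at - created_at.
from datetime import datetime, timedelta, timezone

created_at = datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc)
started_at = created_at + timedelta(seconds=3600)
queue_duration = (started_at - created_at).total_seconds()
assert queue_duration == 3600
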
-import json
-import secrets
-from pathlib import Path
-from unittest.mock import MagicMock, call
-
-import pytest
-
-from errors import DeleteMetricsStorageError, IssueMetricEventError
-from github_type import JobConclusion
-from metrics import events as metric_events
-from metrics import runner as runner_metrics
-from metrics import type as metrics_type
-from metrics.events import RunnerStart, RunnerStop
-from metrics.runner import (
- RUNNER_INSTALLED_TS_FILE_NAME,
- PostJobMetrics,
- PreJobMetrics,
- RunnerMetrics,
-)
-from metrics.storage import MetricsStorage
-
-
-@pytest.fixture(autouse=True, name="issue_event_mock")
-def issue_event_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock:
- """Mock the issue_event function."""
- issue_event_mock = MagicMock()
- monkeypatch.setattr("metrics.events.issue_event", issue_event_mock)
- return issue_event_mock
-
-
-@pytest.fixture(name="runner_fs_base")
-def runner_fs_base_fixture(tmp_path: Path) -> Path:
- """Create a runner filesystem base."""
- runner_fs_base = tmp_path / "runner-fs"
- runner_fs_base.mkdir(exist_ok=True)
- return runner_fs_base
-
-
-def _create_metrics_data(runner_name: str) -> RunnerMetrics:
- """Create a RunnerMetrics object that is suitable for most tests.
-
- Args:
- runner_name: The test runner name.
-
- Returns:
- Test metrics data.
- """
- return RunnerMetrics(
- installed_timestamp=1,
- pre_job=PreJobMetrics(
- timestamp=1,
- workflow="workflow1",
- workflow_run_id="workflow_run_id1",
- repository="org1/repository1",
- event="push",
- ),
- post_job=PostJobMetrics(timestamp=3, status=runner_metrics.PostJobStatus.NORMAL),
- runner_name=runner_name,
- )
-
-
-def _create_runner_fs_base(tmp_path: Path):
- """Create a runner filesystem base.
-
- Args:
- tmp_path: The temporary path to create test runner filesystem under.
-
- Returns:
- The runner filesystem temporary path.
- """
- runner_fs_base = tmp_path / "runner-fs"
- runner_fs_base.mkdir(exist_ok=True)
- return runner_fs_base
-
-
-def _create_runner_files(
- runner_fs_base: Path,
- runner_name: str,
- pre_job_data: str | bytes | None,
- post_job_data: str | bytes | None,
- installed_timestamp: str | bytes | None,
-) -> MetricsStorage:
- """Create runner files inside shared fs.
-
- If the data is bytes, the file is written as binary, otherwise as text.
- If data is None, it is not written.
-
- Args:
- runner_fs_base: The base path of the shared fs.
- runner_name: The runner name.
- pre_job_data: The pre-job metrics data.
- post_job_data: The post-job metrics data.
- installed_timestamp: The installed timestamp.
-
- Returns:
- A MetricsStorage instance.
- """ - runner_fs = runner_fs_base / runner_name - runner_fs.mkdir() - if pre_job_data: - if isinstance(pre_job_data, bytes): - runner_fs.joinpath(runner_metrics.PRE_JOB_METRICS_FILE_NAME).write_bytes(pre_job_data) - else: - runner_fs.joinpath(runner_metrics.PRE_JOB_METRICS_FILE_NAME).write_text( - pre_job_data, encoding="utf-8" - ) - - if post_job_data: - if isinstance(post_job_data, bytes): - runner_fs.joinpath(runner_metrics.POST_JOB_METRICS_FILE_NAME).write_bytes( - post_job_data - ) - else: - runner_fs.joinpath(runner_metrics.POST_JOB_METRICS_FILE_NAME).write_text( - post_job_data, encoding="utf-8" - ) - - if installed_timestamp: - if isinstance(installed_timestamp, bytes): - runner_fs.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).write_bytes(installed_timestamp) - else: - runner_fs.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).write_text( - installed_timestamp, encoding="utf-8" - ) - return MetricsStorage(path=runner_fs, runner_name=runner_name) - - -def test_extract(runner_fs_base: Path): - """ - arrange: \ - 1. A runner with all metrics inside shared fs. \ - 2. A runner with only pre-job metrics inside shared fs. \ - 3. A runner with no metrics except installed_timestamp inside shared fs. - act: Call extract - assert: All shared filesystems are removed and for runners - 1. + 2. metrics are extracted - 3. no metrics are extracted - """ - runner_all_metrics_name = secrets.token_hex(16) - runner_all_metrics = _create_metrics_data(runner_all_metrics_name) - runner_wihout_post_job_name = secrets.token_hex(16) - runner_without_post_job_metrics = runner_all_metrics.copy() - runner_without_post_job_metrics.post_job = None - runner_without_post_job_metrics.runner_name = runner_wihout_post_job_name - - # 1. Runner has all metrics inside shared fs - runner1_fs = _create_runner_files( - runner_fs_base, - runner_all_metrics_name, - runner_all_metrics.pre_job.json(), - runner_all_metrics.post_job.json(), - str(runner_all_metrics.installed_timestamp), - ) - - # 2. Runner has only pre-job metrics inside shared fs - runner2_fs = _create_runner_files( - runner_fs_base, - runner_wihout_post_job_name, - runner_without_post_job_metrics.pre_job.json(), - None, - str(runner_without_post_job_metrics.installed_timestamp), - ) - - # 3. Runner has no metrics except installed_timestamp inside shared fs - runner3_fs = _create_runner_files(runner_fs_base, secrets.token_hex(16), None, None, "5") - - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner1_fs, runner2_fs, runner3_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert extracted_metrics == [ - runner_all_metrics, - runner_without_post_job_metrics, - ] - metrics_storage_manager.delete.assert_has_calls( - [ - ((runner1_fs.runner_name,),), - ((runner2_fs.runner_name,),), - ((runner3_fs.runner_name,),), - ] - ) - - -def test_extract_ignores_runners(runner_fs_base: Path): - """ - arrange: Runners with metrics. - act: Call extract with some runners on ignore list. - expect: The ignored runners are not processed. 
- """ - runner_metrics_data = [] - - runner_filesystems = [] - for i in range(5): - runner_name = secrets.token_hex(16) - data = _create_metrics_data(runner_name) - data.pre_job.workflow = f"workflow{i}" - runner_metrics_data.append(data) - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - data.pre_job.json(), - data.post_job.json(), - str(data.installed_timestamp), - ) - runner_filesystems.append(runner_fs) - - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = runner_filesystems - - ignore_runners = {runner_filesystems[0].runner_name, runner_filesystems[2].runner_name} - - extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, runners=ignore_runners - ) - ) - - assert extracted_metrics == runner_metrics_data[1:2] + runner_metrics_data[3:] - - -def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPatch): - """ - arrange: \ - 1. A runner with non-compliant pre-job metrics inside shared fs. \ - 2. A runner with non-json post-job metrics inside shared fs. \ - 3. A runner with json array post-job metrics inside shared fs. \ - 4. A runner with no real timestamp in installed_timestamp file inside shared fs. - act: Call extract. - assert: No metrics are extracted is issued and shared filesystems are quarantined in all cases. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - # 1. Runner has noncompliant pre-job metrics inside shared fs - invalid_pre_job_data = runner_metrics_data.pre_job.copy(update={"timestamp": -1}) - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - invalid_pre_job_data.json(), - runner_metrics_data.post_job.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner_fs] - move_to_quarantine_mock = MagicMock() - monkeypatch.setattr(runner_metrics, "move_to_quarantine", move_to_quarantine_mock) - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 2. Runner has non-json post-job metrics inside shared fs - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - b"\x00", - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 3. Runner has json post-job metrics but a json array (not object) inside shared fs. 
- runner_name = secrets.token_hex(16)
- runner_metrics_data = _create_metrics_data(runner_name=runner_name)
-
- runner_fs = _create_runner_files(
- runner_fs_base,
- runner_name,
- runner_metrics_data.pre_job.json(),
- json.dumps([runner_metrics_data.post_job.dict()]),
- str(runner_metrics_data.installed_timestamp),
- )
- metrics_storage_manager.list_all.return_value = [runner_fs]
-
- extracted_metrics = list(
- runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set())
- )
- assert not extracted_metrics
- move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name)
-
- # 4. Runner does not have a real timestamp in the installed_timestamp file inside shared fs
- runner_name = secrets.token_hex(16)
- runner_metrics_data = _create_metrics_data(runner_name=runner_name)
-
- runner_fs = _create_runner_files(
- runner_fs_base,
- runner_name,
- runner_metrics_data.pre_job.json(),
- runner_metrics_data.post_job.json(),
- b"\x00",
- )
- metrics_storage_manager.list_all.return_value = [runner_fs]
-
- extracted_metrics = list(
- runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set())
- )
- assert not extracted_metrics
-
- move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name)
-
-
-def test_extract_raises_error_for_too_large_files(
- runner_fs_base: Path, issue_event_mock: MagicMock, monkeypatch: pytest.MonkeyPatch
-):
- """
- arrange: Runners with too large metric and timestamp files.
- act: Call extract.
- assert: No metrics are extracted and the shared filesystems are quarantined.
- """
- runner_name = secrets.token_hex(16)
- runner_metrics_data = _create_metrics_data(runner_name)
-
- # 1. Runner has a pre-job metrics file that is too large
- invalid_pre_job_data = runner_metrics_data.pre_job.copy(
- update={"workflow": "a" * runner_metrics.FILE_SIZE_BYTES_LIMIT + "b"}
- )
-
- runner_fs = _create_runner_files(
- runner_fs_base,
- runner_name,
- invalid_pre_job_data.json(),
- runner_metrics_data.post_job.json(),
- str(runner_metrics_data.installed_timestamp),
- )
- metrics_storage_manager = MagicMock()
-
- metrics_storage_manager.list_all.return_value = [runner_fs]
-
- move_to_quarantine_mock = MagicMock()
- monkeypatch.setattr(runner_metrics, "move_to_quarantine", move_to_quarantine_mock)
-
- extracted_metrics = list(
- runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set())
- )
- assert not extracted_metrics
-
- move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name)
-
- # 2. Runner has a post-job metrics file that is too large
- runner_name = secrets.token_hex(16)
- runner_metrics_data = _create_metrics_data(runner_name)
- invalid_post_job_data = runner_metrics_data.post_job.copy(
- update={"status": "a" * runner_metrics.FILE_SIZE_BYTES_LIMIT + "b"}
- )
- runner_fs = _create_runner_files(
- runner_fs_base,
- runner_name,
- runner_metrics_data.pre_job.json(),
- invalid_post_job_data.json(),
- str(runner_metrics_data.installed_timestamp),
- )
- metrics_storage_manager.list_all.return_value = [runner_fs]
-
- extracted_metrics = list(
- runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set())
- )
-
- assert not extracted_metrics
-
- move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name)
-
- # 3.
Runner has an installed_timestamp file that is too large - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - invalid_ts = "1" * (runner_metrics.FILE_SIZE_BYTES_LIMIT + 1) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - invalid_ts, - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - -def test_extract_ignores_filesystems_without_ts(runner_fs_base: Path): - """ - arrange: A runner without installed_timestamp file inside shared fs. - act: Call extract. - assert: No metrics are extracted and shared filesystem is removed. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = RunnerMetrics.construct( - installed_timestamp=1, - pre_job=PreJobMetrics( - timestamp=1, - workflow="workflow1", - workflow_run_id="workflow_run_id1", - repository="org1/repository1", - event="push", - ), - post_job=PostJobMetrics(timestamp=3, status=runner_metrics.PostJobStatus.NORMAL), - runner_name=runner_name, - ) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - None, - ) - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - metrics_storage_manager.delete.assert_called_once_with(runner_fs.runner_name) - - -def test_extract_ignores_failure_on_shared_fs_cleanup( - runner_fs_base: Path, - caplog: pytest.LogCaptureFixture, -): - """ - arrange: Mock the shared_fs.delete to raise an exception. - act: Call extract. - assert: The metric is extracted and the exception is caught and logged. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_fs = _create_runner_files( - runner_fs_base, - runner_metrics_data.runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager = MagicMock() - - metrics_storage_manager.list_all.return_value = [runner_fs] - - metrics_storage_manager.delete.side_effect = DeleteMetricsStorageError( - "Failed to delete shared filesystem" - ) - - extracted_metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, runners=set() - ) - assert list(extracted_metrics) == [runner_metrics_data] - - assert "Failed to delete shared filesystem" in caplog.text - - -def test_issue_events(issue_event_mock: MagicMock): - """ - arrange: A runner with all metrics. - act: Call issue_events. - assert: RunnerStart and RunnerStop metrics are issued. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert issued_metrics == {metric_events.RunnerStart, metric_events.RunnerStop} - issue_event_mock.assert_has_calls( - [ - # 1. Runner - call( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=runner_metrics_data.pre_job.timestamp - - runner_metrics_data.installed_timestamp, - queue_duration=job_metrics.queue_duration, - ) - ), - call( - RunnerStop( - timestamp=runner_metrics_data.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - status=runner_metrics_data.post_job.status, - job_duration=runner_metrics_data.post_job.timestamp - - runner_metrics_data.pre_job.timestamp, - job_conclusion=job_metrics.conclusion, - ) - ), - ] - ) - - -def test_issue_events_pre_job_before_runner_installed(issue_event_mock: MagicMock): - """ - arrange: A runner with pre-job timestamp smaller than installed timestamp. - act: Call issue_events. - assert: RunnerStart metric is issued with idle set to 0. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.pre_job.timestamp = 0 - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert metric_events.RunnerStart in issued_metrics - issue_event_mock.assert_has_calls( - [ - call( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=0, - queue_duration=job_metrics.queue_duration, - ) - ) - ] - ) - - -def test_issue_events_post_job_before_pre_job(issue_event_mock: MagicMock): - """ - arrange: A runner with post-job timestamp smaller than pre-job timestamps. - act: Call issue_events. - assert: job_duration is set to zero. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.post_job = PostJobMetrics( - timestamp=0, status=runner_metrics.PostJobStatus.NORMAL - ) - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - - assert metric_events.RunnerStop in issued_metrics - issue_event_mock.assert_has_calls( - [ - call( - RunnerStop( - timestamp=runner_metrics_data.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - status=runner_metrics_data.post_job.status, - job_duration=0, - job_conclusion=job_metrics.conclusion, - ) - ), - ] - ) - - -def test_issue_events_no_post_job_metrics(issue_event_mock: MagicMock): - """ - arrange: A runner without post-job metrics. - act: Call issue_events. - assert: Only RunnerStart metric is issued. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.post_job = None - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert issued_metrics == {metric_events.RunnerStart} - - issue_event_mock.assert_called_once_with( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=runner_metrics_data.pre_job.timestamp - runner_metrics_data.installed_timestamp, - queue_duration=job_metrics.queue_duration, - ) - ) - - -def test_issue_events_returns_empty_set_on_issue_event_failure( - issue_event_mock: MagicMock, - caplog: pytest.LogCaptureFixture, -): - """ - arrange: Mock the issue_event_mock to raise an exception on the first call. - act: Call issue_events. - assert: No metrics at all are issued. The exception is caught and logged. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - issue_event_mock.side_effect = [IssueMetricEventError("Failed to issue metric"), None] - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert not issued_metrics - assert "Failed to issue metric" in caplog.text diff --git a/tests/unit/metrics/test_runner_logs.py b/tests/unit/metrics/test_runner_logs.py deleted file mode 100644 index d53dc17cf..000000000 --- a/tests/unit/metrics/test_runner_logs.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
-from pathlib import Path
-
-import pytest
-
-from metrics import runner_logs
-
-
-@pytest.fixture(name="log_dir_base_path")
-def log_dir_base_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
-    """Mock the log directory path and return it."""
-    log_dir_base_path = tmp_path / "log_dir"
-    monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", log_dir_base_path)
-    return log_dir_base_path
-
-
-def test_remove_outdated_crashed(log_dir_base_path: Path, monkeypatch: pytest.MonkeyPatch):
-    """
-    arrange: Mock the base log directory path.
-    act: Remove the logs of the runner.
-    assert: The expected logs are removed.
-    """
-    monkeypatch.setattr(runner_logs, "OUTDATED_LOGS_IN_SECONDS", 0)
-
-    log_dir_path = log_dir_base_path / "test-runner"
-    log_dir_path.mkdir(parents=True)
-
-    runner_logs.remove_outdated()
-
-    assert not log_dir_path.exists()
diff --git a/tests/unit/metrics/test_storage.py b/tests/unit/metrics/test_storage.py
deleted file mode 100644
index bc8d0e94c..000000000
--- a/tests/unit/metrics/test_storage.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright 2024 Canonical Ltd.
-# See LICENSE file for licensing details.
-import secrets
-import tarfile
-from pathlib import Path
-
-import pytest
-
-from errors import (
-    CreateMetricsStorageError,
-    DeleteMetricsStorageError,
-    GetMetricsStorageError,
-    QuarantineMetricsStorageError,
-)
-from metrics import storage
-from metrics.storage import MetricsStorage
-
-
-@pytest.fixture(autouse=True, name="filesystem_paths")
-def filesystem_paths_fixture(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> dict[str, Path]:
-    """Mock the hardcoded filesystem paths."""
-    ms_path = tmp_path / "runner-fs"
-    ms_quarantine_path = tmp_path / "quarantine"
-    monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", ms_path)
-    monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", ms_quarantine_path)
-    return {"base": ms_path, "quarantine": ms_quarantine_path}
-
-
-def test_create_creates_directory():
-    """
-    arrange: Given a runner name and a path for the storage.
-    act: Call create.
-    assert: The directory is created.
-    """
-    runner_name = secrets.token_hex(16)
-
-    fs = storage.create(runner_name)
-
-    assert fs.path.exists()
-    assert fs.path.is_dir()
-
-
-def test_create_raises_exception_if_already_exists():
-    """
-    arrange: Given a runner name and an already existing shared filesystem.
-    act: Call create.
-    assert: The expected exception is raised.
-    """
-    runner_name = secrets.token_hex(16)
-    storage.create(runner_name)
-
-    with pytest.raises(CreateMetricsStorageError):
-        storage.create(runner_name)
-
-
-def test_list_all():
-    """
-    arrange: Create metric storages for multiple runners.
-    act: Call list_all.
-    assert: A generator listing all the shared filesystems is returned.
-    """
-    runner_names = [secrets.token_hex(16) for _ in range(3)]
-    for runner_name in runner_names:
-        storage.create(runner_name)
-
-    fs_list = list(storage.list_all())
-
-    assert len(fs_list) == 3
-    for fs in fs_list:
-        assert isinstance(fs, storage.MetricsStorage)
-        assert fs.runner_name in runner_names
-
-
-def test_list_all_empty():
-    """
-    arrange: Nothing.
-    act: Call list_all.
-    assert: An empty iterator is returned.
-    """
-    fs_list = list(storage.list_all())
-
-    assert len(fs_list) == 0
-
-
-def test_delete():
-    """
-    arrange: Create metrics storage for a runner.
-    act: Call delete
-    assert: The storage is deleted.
-    """
-    runner_name = secrets.token_hex(16)
-    storage.create(runner_name)
-
-    storage.delete(runner_name)
-
-    with pytest.raises(GetMetricsStorageError):
-        storage.get(runner_name)
-
-
-def test_delete_raises_error():
-    """
-    arrange: Nothing.
-    act: Call delete.
-    assert: A DeleteMetricsStorageError is raised.
-    """
-    runner_name = secrets.token_hex(16)
-
-    with pytest.raises(DeleteMetricsStorageError):
-        storage.delete(runner_name)
-
-
-def test_get():
-    """
-    arrange: Given a runner name.
-    act: Call create and get.
-    assert: A metrics storage object for this runner is returned.
-    """
-    runner_name = secrets.token_hex(16)
-
-    storage.create(runner_name)
-    ms = storage.get(runner_name)
-
-    assert isinstance(ms, MetricsStorage)
-    assert ms.runner_name == runner_name
-
-
-def test_get_raises_error_if_not_found():
-    """
-    arrange: Nothing.
-    act: Call get.
-    assert: A GetMetricsStorageError is raised.
-    """
-    runner_name = secrets.token_hex(16)
-
-    with pytest.raises(GetMetricsStorageError):
-        storage.get(runner_name)
-
-
-def test_quarantine(filesystem_paths: dict[str, Path], tmp_path: Path):
-    """
-    arrange: Create a storage for a runner with a file in it.
-    act: Call quarantine.
-    assert: The storage is moved to the quarantine.
-    """
-    runner_name = secrets.token_hex(16)
-    ms = storage.create(runner_name)
-    ms.path.joinpath("test.txt").write_text("foo bar")
-
-    storage.move_to_quarantine(storage, runner_name)
-
-    tarfile_path = filesystem_paths["quarantine"].joinpath(runner_name).with_suffix(".tar.gz")
-    assert tarfile_path.exists()
-    tarfile.open(tarfile_path).extractall(path=tmp_path)
-    assert tmp_path.joinpath(f"{runner_name}/test.txt").exists()
-    assert tmp_path.joinpath(f"{runner_name}/test.txt").read_text(encoding="utf-8") == "foo bar"
-    assert not ms.path.exists()
-
-
-def test_quarantine_raises_error():
-    """
-    arrange: Nothing.
-    act: Call quarantine.
-    assert: A QuarantineMetricsStorageError is raised.
-    """
-    runner_name = secrets.token_hex(16)
-
-    with pytest.raises(QuarantineMetricsStorageError):
-        storage.move_to_quarantine(storage, runner_name)
diff --git a/tests/unit/mock.py b/tests/unit/mock.py
index be3e07ca7..78c0c6990 100644
--- a/tests/unit/mock.py
+++ b/tests/unit/mock.py
@@ -12,8 +12,9 @@
 from pathlib import Path
 from typing import IO, Optional, Sequence, Union
 
+from github_runner_manager.types_.github import RegistrationToken, RemoveToken, RunnerApplication
+
 from errors import LxdError, RunnerError
-from github_type import RegistrationToken, RemoveToken, RunnerApplication
 from lxd_type import LxdNetwork
 from runner import LxdInstanceConfig
 
diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py
index 443c84dfd..b52afa538 100644
--- a/tests/unit/mock_runner_managers.py
+++ b/tests/unit/mock_runner_managers.py
@@ -7,17 +7,18 @@
 from typing import Iterable, Iterator, Sequence
 from unittest.mock import MagicMock
 
-from charm_state import GitHubPath
-from github_client import GithubClient
-from github_type import GitHubRunnerStatus, SelfHostedRunner
-from manager.cloud_runner_manager import (
+from github_runner_manager.manager.cloud_runner_manager import (
     CloudRunnerInstance,
     CloudRunnerManager,
     CloudRunnerState,
     InstanceId,
 )
-from manager.github_runner_manager import GitHubRunnerState
-from metrics.runner import RunnerMetrics
+from github_runner_manager.manager.github_runner_manager import GitHubRunnerState
+from github_runner_manager.metrics.runner import RunnerMetrics
+from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner
+
+from charm_state import GitHubPath
+from github_client import GithubClient
 
 from tests.unit.mock import MockGhapiClient
 
diff --git a/tests/unit/reactive/__init__.py b/tests/unit/reactive/__init__.py
deleted file mode 100644
index 188515554..000000000
--- a/tests/unit/reactive/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# Copyright 2024 Canonical Ltd.
-# See LICENSE file for licensing details.
diff --git a/tests/unit/reactive/test_consumer.py b/tests/unit/reactive/test_consumer.py
deleted file mode 100644
index 2a443c9b3..000000000
--- a/tests/unit/reactive/test_consumer.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright 2024 Canonical Ltd.
-# See LICENSE file for licensing details.
-
-import secrets
-from contextlib import closing
-
-import pytest
-from kombu import Connection, Message
-
-from reactive import consumer
-from reactive.consumer import JobError
-
-IN_MEMORY_URI = "memory://"
-FAKE_RUN_URL = "https://api.github.com/repos/fakeusergh-runner-test/actions/runs/8200803099"
-
-
-def test_consume(caplog: pytest.LogCaptureFixture):
-    """
-    arrange: A job placed in the message queue.
-    act: Call consume
-    assert: The job is logged.
-    """
-    queue_name = secrets.token_hex(16)
-    job_details = consumer.JobDetails(
-        labels=[secrets.token_hex(16), secrets.token_hex(16)],
-        run_url=FAKE_RUN_URL,
-    )
-    _put_in_queue(job_details.json(), queue_name)
-
-    # we use construct to avoid pydantic validation as IN_MEMORY_URI is not a valid URL
-    consumer.consume(IN_MEMORY_URI, queue_name)
-    assert str(job_details.labels) in caplog.text
-    assert job_details.run_url in caplog.text
-
-
-@pytest.mark.parametrize(
-    "job_str",
-    [
-        pytest.param(
-            '{"labels": ["label1", "label2"], "status": "completed"}', id="run_url missing"
-        ),
-        pytest.param(
-            '{"status": "completed", "run_url": "https://example.com"}', id="labels missing"
-        ),
-        pytest.param("no json at all", id="invalid json"),
-    ],
-)
-def test_job_details_validation_error(job_str: str):
-    """
-    arrange: A job placed in the message queue with invalid details.
-    act: Call consume
-    assert: A JobError is raised and the message is requeued.
-    """
-    queue_name = secrets.token_hex(16)
-    _put_in_queue(job_str, queue_name)
-
-    with pytest.raises(JobError) as exc_info:
-        consumer.consume(IN_MEMORY_URI, queue_name)
-    assert "Invalid job details" in str(exc_info.value)
-
-    # Ensure message has been requeued by reconsuming it
-    msg = _consume_from_queue(queue_name)
-    assert msg.payload == job_str
-
-
-def _put_in_queue(msg: str, queue_name: str) -> None:
-    """Put a job in the message queue.
-
-    Args:
-        msg: The job details.
-        queue_name: The name of the queue
-    """
-    with Connection(IN_MEMORY_URI) as conn:
-        with closing(conn.SimpleQueue(queue_name)) as simple_queue:
-            simple_queue.put(msg, retry=True)
-
-
-def _consume_from_queue(queue_name: str) -> Message:
-    """Consume a job from the message queue.
-
-    Args:
-        queue_name: The name of the queue
-
-    Returns:
-        The message consumed from the queue.
-    """
-    with Connection(IN_MEMORY_URI) as conn:
-        with closing(conn.SimpleQueue(queue_name)) as simple_queue:
-            return simple_queue.get(block=False)
diff --git a/tests/unit/reactive/test_runner_manager.py b/tests/unit/reactive/test_runner_manager.py
deleted file mode 100644
index cd25cf728..000000000
--- a/tests/unit/reactive/test_runner_manager.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Copyright 2024 Canonical Ltd.
-# See LICENSE file for licensing details.
-import os
-import secrets
-import subprocess
-from pathlib import Path
-from subprocess import CompletedProcess
-from unittest.mock import MagicMock
-
-import pytest
-
-from reactive.runner_manager import (
-    PIDS_COMMAND_LINE,
-    PYTHON_BIN,
-    REACTIVE_RUNNER_SCRIPT_FILE,
-    ReactiveRunnerError,
-    reconcile,
-)
-from utilities import secure_run_subprocess
-
-EXAMPLE_MQ_URI = "http://example.com"
-
-
-@pytest.fixture(name="log_dir", autouse=True)
-def log_dir_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
-    """Return the path to the log file."""
-    log_file_path = tmp_path / "logs"
-    monkeypatch.setattr("reactive.runner_manager.REACTIVE_RUNNER_LOG_DIR", log_file_path)
-    monkeypatch.setattr("shutil.chown", lambda *args, **kwargs: None)
-    return log_file_path
-
-
-@pytest.fixture(name="secure_run_subprocess_mock")
-def secure_run_subprocess_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock:
-    """Mock the ps command."""
-    secure_run_subprocess_mock = MagicMock(spec=secure_run_subprocess)
-    monkeypatch.setattr(
-        "reactive.runner_manager.secure_run_subprocess", secure_run_subprocess_mock
-    )
-    return secure_run_subprocess_mock
-
-
-@pytest.fixture(name="os_kill_mock", autouse=True)
-def os_kill_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock:
-    """Mock the os.kill function."""
-    os_kill_mock = MagicMock(spec=os.kill)
-    monkeypatch.setattr("os.kill", os_kill_mock)
-    return os_kill_mock
-
-
-@pytest.fixture(name="subprocess_popen_mock")
-def subprocess_popen_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock:
-    """Mock the subprocess.Popen function."""
-    popen_result = MagicMock(spec=subprocess.Popen, pid=1234, returncode=0)
-    subprocess_popen_mock = MagicMock(
-        spec=subprocess.Popen,
-        return_value=popen_result,
-    )
-    monkeypatch.setattr("subprocess.Popen", subprocess_popen_mock)
-    return subprocess_popen_mock
-
-
-def test_reconcile_spawns_runners(
-    secure_run_subprocess_mock: MagicMock, subprocess_popen_mock: MagicMock, log_dir: Path
-):
-    """
-    arrange: Mock that two reactive runner processes are active.
-    act: Call reconcile with a quantity of 5.
-    assert: Three runners are spawned. Log file is setup.
-    """
-    queue_name = secrets.token_hex(16)
-    _arrange_reactive_processes(secure_run_subprocess_mock, count=2)
-
-    delta = reconcile(5, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name)
-
-    assert delta == 3
-    assert subprocess_popen_mock.call_count == 3
-    assert log_dir.exists()
-
-
-def test_reconcile_does_not_spawn_runners(
-    secure_run_subprocess_mock: MagicMock, subprocess_popen_mock: MagicMock
-):
-    """
-    arrange: Mock that two reactive runner processes are active.
-    act: Call reconcile with a quantity of 2.
-    assert: No runners are spawned.
-    """
-    queue_name = secrets.token_hex(16)
-    _arrange_reactive_processes(secure_run_subprocess_mock, count=2)
-
-    delta = reconcile(2, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name)
-
-    assert delta == 0
-    assert subprocess_popen_mock.call_count == 0
-
-
-def test_reconcile_kills_processes_for_too_many_processes(
-    secure_run_subprocess_mock: MagicMock,
-    subprocess_popen_mock: MagicMock,
-    os_kill_mock: MagicMock,
-):
-    """
-    arrange: Mock that 3 reactive runner processes are active.
-    act: Call reconcile with a quantity of 1.
-    assert: 2 processes are killed.
-    """
-    queue_name = secrets.token_hex(16)
-    _arrange_reactive_processes(secure_run_subprocess_mock, count=3)
-    delta = reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name)
-
-    assert delta == -2
-    assert subprocess_popen_mock.call_count == 0
-    assert os_kill_mock.call_count == 2
-
-
-def test_reconcile_ignore_process_not_found_on_kill(
-    secure_run_subprocess_mock: MagicMock,
-    subprocess_popen_mock: MagicMock,
-    os_kill_mock: MagicMock,
-):
-    """
-    arrange: Mock 3 reactive processes and os.kill to fail once with a ProcessLookupError.
-    act: Call reconcile with a quantity of 1.
-    assert: The returned delta is still -2.
-    """
-    queue_name = secrets.token_hex(16)
-    _arrange_reactive_processes(secure_run_subprocess_mock, count=3)
-    os_kill_mock.side_effect = [None, ProcessLookupError]
-    delta = reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name)
-
-    assert delta == -2
-    assert subprocess_popen_mock.call_count == 0
-    assert os_kill_mock.call_count == 2
-
-
-def test_reconcile_raises_reactive_runner_error_on_ps_failure(
-    secure_run_subprocess_mock: MagicMock,
-):
-    """
-    arrange: Mock that the ps command fails.
-    act: Call reconcile with a quantity of 1.
-    assert: A ReactiveRunnerError is raised.
-    """
-    queue_name = secrets.token_hex(16)
-    secure_run_subprocess_mock.return_value = CompletedProcess(
-        args=PIDS_COMMAND_LINE,
-        returncode=1,
-        stdout=b"",
-        stderr=b"error",
-    )
-
-    with pytest.raises(ReactiveRunnerError) as err:
-        reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name)
-
-    assert "Failed to get list of processes" in str(err.value)
-
-
-def _arrange_reactive_processes(secure_run_subprocess_mock: MagicMock, count: int):
-    """Mock reactive runner processes are active.
-
-    Args:
-        secure_run_subprocess_mock: The mock to use for the ps command.
-        count: The number of processes.
-    """
-    process_cmds_before = "\n".join(
-        [f"{PYTHON_BIN} {REACTIVE_RUNNER_SCRIPT_FILE}\t{i}" for i in range(count)]
-    )
-
-    secure_run_subprocess_mock.return_value = CompletedProcess(
-        args=PIDS_COMMAND_LINE,
-        returncode=0,
-        stdout=f"CMD\n{process_cmds_before}".encode("utf-8"),
-        stderr=b"",
-    )
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py
index 060bbc96f..a28fc9743 100644
--- a/tests/unit/test_charm.py
+++ b/tests/unit/test_charm.py
@@ -12,6 +12,7 @@
 import pytest
 import yaml
+from github_runner_manager.types_.github import GitHubOrg, GitHubRepo, GitHubRunnerStatus
 from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, StatusBase, WaitingStatus
 from ops.testing import Harness
 
@@ -28,8 +29,6 @@
     VM_CPU_CONFIG_NAME,
     VM_DISK_CONFIG_NAME,
     Arch,
-    GitHubOrg,
-    GitHubRepo,
     InstanceType,
     OpenStackCloudsYAML,
     OpenstackImage,
@@ -41,14 +40,12 @@
     LogrotateSetupError,
     MissingMongoDBError,
     MissingRunnerBinaryError,
-    OpenStackUnauthorizedError,
     RunnerError,
     SubprocessError,
     TokenError,
 )
 from event_timer import EventTimer, TimerEnableError
 from firewall import FirewallEntry
-from github_type import GitHubRunnerStatus
 from runner_manager import LXDRunnerManagerConfig, RunnerInfo
 
 TEST_PROXY_SERVER_URL = "http://proxy.server:1234"
@@ -761,7 +758,6 @@ def test_on_flush_runners_action(self, run, wt, mkdir, rm):
             pytest.param(ConfigurationError, BlockedStatus, id="charm config error"),
             pytest.param(TokenError, BlockedStatus, id="github token error"),
             pytest.param(MissingRunnerBinaryError, MaintenanceStatus, id="runner binary error"),
-            pytest.param(OpenStackUnauthorizedError, BlockedStatus, id="openstack auth error"),
         ],
     )
     def test_catch_charm_errors(
diff --git a/tests/unit/test_charm_state.py b/tests/unit/test_charm_state.py
index d8fdd896d..b7df8a5dc 100644
--- a/tests/unit/test_charm_state.py
+++ b/tests/unit/test_charm_state.py
@@ -8,20 +8,22 @@
 from pathlib import Path
 from unittest.mock import MagicMock
 
+import github_runner_manager.openstack_cloud
 import pytest
 import yaml
 from charms.data_platform_libs.v0.data_interfaces import DatabaseRequires
+from github_runner_manager.types_.github import GitHubOrg, GitHubRepo
 from pydantic import BaseModel
 from pydantic.error_wrappers import ValidationError
 from pydantic.networks import IPv4Address
 
 import charm_state
-import openstack_cloud
 from charm_state import (
     BASE_IMAGE_CONFIG_NAME,
     DEBUG_SSH_INTEGRATION_NAME,
     DENYLIST_CONFIG_NAME,
     DOCKERHUB_MIRROR_CONFIG_NAME,
+    GROUP_CONFIG_NAME,
     IMAGE_INTEGRATION_NAME,
     LABELS_CONFIG_NAME,
     OPENSTACK_CLOUDS_YAML_CONFIG_NAME,
@@ -41,8 +43,6 @@
     CharmState,
     FirewallEntry,
     GithubConfig,
-    GitHubOrg,
-    GitHubRepo,
     ImmutableConfigChangedError,
     LocalLxdRunnerConfig,
     OpenstackImage,
@@ -87,20 +87,21 @@ def test_github_org_path():
     assert path == org
 
 
-def test_parse_github_path_invalid():
+def test_github_config_from_charm_invalid_path():
     """
     arrange: Create an invalid GitHub path string and runner group name.
    act: Call parse_github_path with the invalid path string and runner group name.
     assert: Verify that the function raises CharmConfigInvalidError.
     """
-    path_str = "invalidpath/"
-    runner_group = "test_group"
+    mock_charm = MockGithubRunnerCharmFactory()
+    mock_charm.config[PATH_CONFIG_NAME] = "invalidpath/"
+    mock_charm.config[GROUP_CONFIG_NAME] = "test_group"
 
     with pytest.raises(CharmConfigInvalidError):
-        charm_state.parse_github_path(path_str, runner_group)
+        GithubConfig.from_charm(mock_charm)
 
 
-def test_github_config_from_charm_invalid_path():
+def test_github_config_from_charm_empty_path():
     """
     arrange: Create a mock CharmBase instance with an empty path configuration.
     act: Call from_charm method with the mock CharmBase instance.
@@ -367,9 +368,9 @@ def test_parse_openstack_clouds_initialize_fail(
     mock_charm = MockGithubRunnerCharmFactory()
     mock_charm.config[OPENSTACK_CLOUDS_YAML_CONFIG_NAME] = valid_yaml_config
     monkeypatch.setattr(
-        openstack_cloud,
+        github_runner_manager.openstack_cloud,
         "initialize",
-        MagicMock(side_effect=openstack_cloud.OpenStackInvalidConfigError),
+        MagicMock(side_effect=github_runner_manager.openstack_cloud.OpenStackInvalidConfigError),
     )
 
     with pytest.raises(CharmConfigInvalidError):
diff --git a/tests/unit/test_github_client.py b/tests/unit/test_github_client.py
deleted file mode 100644
index 9bd336a03..000000000
--- a/tests/unit/test_github_client.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Copyright 2024 Canonical Ltd.
-# See LICENSE file for licensing details.
-import http
-import random
-import secrets
-from collections import namedtuple
-from datetime import datetime, timezone
-from unittest.mock import MagicMock
-from urllib.error import HTTPError
-
-import pytest
-
-from charm_state import GitHubRepo
-from errors import JobNotFoundError
-from github_client import GithubClient
-from github_type import JobConclusion, JobStats
-
-JobStatsRawData = namedtuple(
-    "JobStatsRawData",
-    ["created_at", "started_at", "runner_name", "conclusion", "id"],
-)
-
-
-@pytest.fixture(name="job_stats_raw")
-def job_stats_fixture() -> JobStatsRawData:
-    """Create a JobStats object."""
-    runner_name = secrets.token_hex(16)
-    return JobStatsRawData(
-        created_at="2021-10-01T00:00:00Z",
-        started_at="2021-10-01T01:00:00Z",
-        conclusion="success",
-        runner_name=runner_name,
-        id=random.randint(1, 1000),
-    )
-
-
-@pytest.fixture(name="github_client")
-def github_client_fixture(job_stats_raw: JobStatsRawData) -> GithubClient:
-    """Create a GithubClient object with a mocked GhApi object."""
-    gh_client = GithubClient("token")
-    gh_client._client = MagicMock()
-    gh_client._client.actions.list_jobs_for_workflow_run.return_value = {
-        "jobs": [
-            {
-                "created_at": job_stats_raw.created_at,
-                "started_at": job_stats_raw.started_at,
-                "runner_name": job_stats_raw.runner_name,
-                "conclusion": job_stats_raw.conclusion,
-                "id": job_stats_raw.id,
-            }
-        ]
-    }
-
-    return gh_client
-
-
-def _mock_multiple_pages_for_job_response(
-    github_client: GithubClient, job_stats_raw: JobStatsRawData, include_runner: bool = True
-):
-    """Mock the list_jobs_for_workflow_run to return multiple pages.
-
-    Args:
-        github_client: The GithubClient object to mock.
-        job_stats_raw: The JobStatsRawData object to use for the response.
-        include_runner: Whether to include the runner in the response for one of the jobs.
-    """
-    no_of_pages = random.choice(range(1, 5))
-    no_of_jobs_per_page = random.choice(range(1, 4))
-    runner_names = [secrets.token_hex(16) for _ in range(no_of_pages * no_of_jobs_per_page)]
-
-    if include_runner:
-        runner_names[random.choice(range(no_of_pages))] = job_stats_raw.runner_name
-
-    github_client._client.actions.list_jobs_for_workflow_run.side_effect = [
-        {
-            "jobs": [
-                {
-                    "created_at": job_stats_raw.created_at,
-                    "started_at": job_stats_raw.started_at,
-                    "runner_name": runner_names[i * no_of_jobs_per_page + j],
-                    "conclusion": job_stats_raw.conclusion,
-                    "id": job_stats_raw.id,
-                }
-                for j in range(no_of_jobs_per_page)
-            ]
-        }
-        for i in range(no_of_pages)
-    ] + [{"jobs": []}]
-
-
-def test_get_job_info(github_client: GithubClient, job_stats_raw: JobStatsRawData):
-    """
-    arrange: A mocked Github Client that returns one page of jobs containing one job \
-        with the runner.
-    act: Call get_job_info.
-    assert: The correct JobStats object is returned.
-    """
-    github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16))
-    job_stats = github_client.get_job_info(
-        path=github_repo,
-        workflow_run_id=secrets.token_hex(16),
-        runner_name=job_stats_raw.runner_name,
-    )
-    assert job_stats == JobStats(
-        created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc),
-        started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc),
-        runner_name=job_stats_raw.runner_name,
-        conclusion=JobConclusion.SUCCESS,
-        job_id=job_stats_raw.id,
-    )
-
-
-def test_get_job_info_no_conclusion(github_client: GithubClient, job_stats_raw: JobStatsRawData):
-    """
-    arrange: A mocked Github Client that returns one page of jobs containing one job \
-        with the runner with conclusion set to None.
-    act: Call get_job_info.
-    assert: JobStats object with conclusion set to None is returned.
-    """
-    github_client._client.actions.list_jobs_for_workflow_run.return_value = {
-        "jobs": [
-            {
-                "created_at": job_stats_raw.created_at,
-                "started_at": job_stats_raw.started_at,
-                "runner_name": job_stats_raw.runner_name,
-                "conclusion": None,
-                "id": job_stats_raw.id,
-            }
-        ]
-    }
-    github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16))
-    job_stats = github_client.get_job_info(
-        path=github_repo,
-        workflow_run_id=secrets.token_hex(16),
-        runner_name=job_stats_raw.runner_name,
-    )
-    assert job_stats == JobStats(
-        created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc),
-        started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc),
-        runner_name=job_stats_raw.runner_name,
-        conclusion=None,
-        job_id=job_stats_raw.id,
-    )
-
-
-def test_github_api_pagination_multiple_pages(
-    github_client: GithubClient, job_stats_raw: JobStatsRawData
-):
-    """
-    arrange: A mocked Github Client that returns multiple pages of jobs containing \
-        one job with the runner.
-    act: Call get_job_info.
-    assert: The correct JobStats object is returned.
-    """
-    _mock_multiple_pages_for_job_response(
-        github_client=github_client, job_stats_raw=job_stats_raw, include_runner=True
-    )
-
-    github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16))
-    job_stats = github_client.get_job_info(
-        path=github_repo,
-        workflow_run_id=secrets.token_hex(16),
-        runner_name=job_stats_raw.runner_name,
-    )
-    assert job_stats == JobStats(
-        created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc),
-        started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc),
-        runner_name=job_stats_raw.runner_name,
-        conclusion=JobConclusion.SUCCESS,
-        job_id=job_stats_raw.id,
-    )
-
-
-def test_github_api_pagination_job_not_found(
-    github_client: GithubClient, job_stats_raw: JobStatsRawData
-):
-    """
-    arrange: A mocked Github Client that returns multiple pages of jobs containing \
-        no job with the runner.
-    act: Call get_job_info.
-    assert: An exception is raised.
-    """
-    _mock_multiple_pages_for_job_response(
-        github_client=github_client, job_stats_raw=job_stats_raw, include_runner=False
-    )
-
-    github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16))
-
-    with pytest.raises(JobNotFoundError):
-        github_client.get_job_info(
-            path=github_repo,
-            workflow_run_id=secrets.token_hex(16),
-            runner_name=job_stats_raw.runner_name,
-        )
-
-
-def test_github_api_http_error(github_client: GithubClient, job_stats_raw: JobStatsRawData):
-    github_client._client.actions.list_jobs_for_workflow_run.side_effect = HTTPError(
-        "http://test.com", 500, "", http.client.HTTPMessage(), None
-    )
-    github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16))
-
-    with pytest.raises(JobNotFoundError):
-        github_client.get_job_info(
-            path=github_repo,
-            workflow_run_id=secrets.token_hex(16),
-            runner_name=job_stats_raw.runner_name,
-        )
diff --git a/tests/unit/test_lxd_runner_manager.py b/tests/unit/test_lxd_runner_manager.py
index 36c36df11..215cbe7e0 100644
--- a/tests/unit/test_lxd_runner_manager.py
+++ b/tests/unit/test_lxd_runner_manager.py
@@ -7,26 +7,29 @@
 from pathlib import Path
 from unittest.mock import MagicMock, call
 
+import github_runner_manager.reactive.runner_manager
 import pytest
+from github_runner_manager.metrics.events import (
+    Reconciliation,
+    RunnerInstalled,
+    RunnerStart,
+    RunnerStop,
+)
+from github_runner_manager.metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME
+from github_runner_manager.metrics.storage import MetricsStorage
+from github_runner_manager.types_.github import GitHubOrg, GitHubRepo, RunnerApplication
 from pytest import LogCaptureFixture, MonkeyPatch
 
-import reactive.runner_manager
 import shared_fs
 from charm_state import (
     Arch,
     CharmConfig,
     CharmState,
-    GitHubOrg,
-    GitHubRepo,
     ProxyConfig,
     ReactiveConfig,
     VirtualMachineResources,
 )
 from errors import IssueMetricEventError, RunnerBinaryError
-from github_type import RunnerApplication
-from metrics.events import Reconciliation, RunnerInstalled, RunnerStart, RunnerStop
-from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME
-from metrics.storage import MetricsStorage
 from runner import Runner, RunnerStatus
 from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, LXDRunnerManager, LXDRunnerManagerConfig
 from runner_type import RunnerNameByHealth
@@ -107,7 +110,7 @@ def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state):
 def issue_event_mock_fixture(monkeypatch: MonkeyPatch) -> MagicMock:
     """Mock the issue_event function."""
     issue_event_mock = MagicMock()
-    monkeypatch.setattr("metrics.events.issue_event", issue_event_mock)
+    monkeypatch.setattr("github_runner_manager.metrics.events.issue_event", issue_event_mock)
     return issue_event_mock
 
@@ -131,7 +134,7 @@ def runner_metrics_fixture(monkeypatch: MonkeyPatch) -> MagicMock:
 @pytest.fixture(name="reactive_reconcile_mock")
 def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> MagicMock:
     """Mock the job class."""
-    reconcile_mock = MagicMock(spec=reactive.runner_manager.reconcile)
+    reconcile_mock = MagicMock(spec=github_runner_manager.reactive.runner_manager.reconcile)
     monkeypatch.setattr("runner_manager.reactive_runner_manager.reconcile", reconcile_mock)
     reconcile_mock.side_effect = lambda quantity, **kwargs: quantity
     return reconcile_mock
diff --git a/tests/unit/test_openstack_cloud.py b/tests/unit/test_openstack_cloud.py
deleted file mode 100644
index 4f599e914..000000000
--- a/tests/unit/test_openstack_cloud.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright 2024 Canonical Ltd.
-# See LICENSE file for licensing details.
-from pathlib import Path
-
-import pytest
-import yaml
-
-import openstack_cloud
-from errors import OpenStackInvalidConfigError
-
-
-def test_initialize(clouds_yaml_path: Path, clouds_yaml: dict):
-    """
-    arrange: Mocked clouds.yaml data and path.
-    act: Call initialize.
-    assert: The clouds.yaml file is written to disk.
-    """
-    openstack_cloud.initialize(clouds_yaml)
-
-    assert yaml.safe_load(clouds_yaml_path.read_text(encoding="utf-8")) == clouds_yaml
-
-
-@pytest.mark.parametrize(
-    "invalid_yaml, expected_err_msg",
-    [
-        pytest.param(
-            {"wrong-key": {"cloud_name": {"auth": {}}}}, "Missing key 'clouds' from config."
-        ),
-        pytest.param({}, "Missing key 'clouds' from config."),
-        pytest.param({"clouds": {}}, "No clouds defined in clouds.yaml."),
-    ],
-)
-def test_initialize_validation_error(invalid_yaml: dict, expected_err_msg):
-    """
-    arrange: Mocked clouds.yaml data with invalid data.
-    act: Call initialize.
-    assert: InvalidConfigError is raised.
-    """
-    with pytest.raises(OpenStackInvalidConfigError) as exc:
-        openstack_cloud.initialize(invalid_yaml)
-    assert expected_err_msg in str(exc)
diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py
index af7954d06..e6d57f305 100644
--- a/tests/unit/test_runner.py
+++ b/tests/unit/test_runner.py
@@ -8,12 +8,14 @@
 from pathlib import Path
 from unittest.mock import MagicMock, call
 
+import github_runner_manager.metrics.runner_logs
 import jinja2
 import pytest
 from _pytest.monkeypatch import MonkeyPatch
+from github_runner_manager.metrics.storage import MetricsStorage
+from github_runner_manager.types_.github import GitHubOrg, GitHubRepo
 
-import metrics.runner_logs
-from charm_state import GitHubOrg, GitHubRepo, SSHDebugConnection, VirtualMachineResources
+from charm_state import SSHDebugConnection, VirtualMachineResources
 from errors import (
     CreateMetricsStorageError,
     LxdError,
@@ -22,7 +24,6 @@
     RunnerRemoveError,
 )
 from lxd import LxdInstance, LxdInstanceFileManager
-from metrics.storage import MetricsStorage
 from runner import DIAG_DIR_PATH, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus
 from runner_manager_type import RunnerManagerClients
 from runner_type import ProxySetting
@@ -102,7 +103,9 @@ def create_logs_dir(runner_name: str) -> Path:
         return target_log_path
 
-    create_logs_dir_mock = MagicMock(spec=metrics.runner_logs.create_logs_dir)
+    create_logs_dir_mock = MagicMock(
+        spec=github_runner_manager.metrics.runner_logs.create_logs_dir
+    )
     create_logs_dir_mock.side_effect = create_logs_dir
     monkeypatch.setattr("runner.create_logs_dir", create_logs_dir_mock)
 
@@ -522,7 +525,7 @@ def test_pull_logs(runner: Runner, log_dir_base_path: Path):
     runner.instance.files.pull_file.assert_has_calls(
         [
            call(str(DIAG_DIR_PATH), str(log_dir_path), is_dir=True),
-            call(str(metrics.runner_logs.SYSLOG_PATH), str(log_dir_path)),
+            call(str(github_runner_manager.metrics.runner_logs.SYSLOG_PATH), str(log_dir_path)),
         ]
     )
 
diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py
index 845c8da49..f3199fd99 100644
--- a/tests/unit/test_runner_scaler.py
+++ b/tests/unit/test_runner_scaler.py
@@ -6,12 +6,16 @@
 from unittest.mock import MagicMock
 
 import pytest
+from github_runner_manager.manager.cloud_runner_manager import CloudRunnerState, InstanceId
+from github_runner_manager.manager.github_runner_manager import GitHubRunnerState
+from github_runner_manager.manager.runner_manager import (
+    FlushMode,
+    RunnerManager,
+    RunnerManagerConfig,
+)
+from github_runner_manager.manager.runner_scaler import RunnerScaler
+from github_runner_manager.types_.github import GitHubPath, GitHubRepo
 
-from charm_state import GitHubPath, GitHubRepo
-from manager.cloud_runner_manager import CloudRunnerState, InstanceId
-from manager.github_runner_manager import GitHubRunnerState
-from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig
-from manager.runner_scaler import RunnerScaler
 from tests.unit.mock_runner_managers import (
     MockCloudRunnerManager,
     MockGitHubRunnerManager,
@@ -58,11 +62,16 @@ def runner_manager_fixture(
 ) -> RunnerManager:
     mock_cloud, mock_github = mock_runner_managers
     monkeypatch.setattr(
-        "manager.runner_manager.RunnerManager._spawn_runners", mock_runner_manager_spawn_runners
+        "github_runner_manager.manager.runner_manager.RunnerManager._spawn_runners",
+        mock_runner_manager_spawn_runners,
     )
     # Patch out the metrics, as metrics has their own tests.
-    monkeypatch.setattr("manager.runner_manager.github_metrics.job", MagicMock())
-    monkeypatch.setattr("manager.runner_manager.runner_metrics.issue_events", MagicMock())
+    monkeypatch.setattr(
+        "github_runner_manager.manager.runner_manager.github_metrics.job", MagicMock()
+    )
+    monkeypatch.setattr(
+        "github_runner_manager.manager.runner_manager.runner_metrics.issue_events", MagicMock()
+    )
 
     config = RunnerManagerConfig("mock_token", github_path)
     runner_manager = RunnerManager("mock_runners", mock_cloud, config)
diff --git a/tests/unit/test_shared_fs.py b/tests/unit/test_shared_fs.py
index 0c1266566..2a21bf3cc 100644
--- a/tests/unit/test_shared_fs.py
+++ b/tests/unit/test_shared_fs.py
@@ -7,6 +7,7 @@
 
 import pytest
 from _pytest.monkeypatch import MonkeyPatch
+from github_runner_manager.metrics.storage import MetricsStorage
 
 import shared_fs
 from errors import (
@@ -15,7 +16,6 @@
     GetMetricsStorageError,
     SubprocessError,
 )
-from metrics.storage import MetricsStorage
 
 MOUNTPOINT_FAILURE_EXIT_CODE = 1
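Note: the net effect of the import changes above is that tests (and any other consumer) now obtain the runner-manager machinery from the external github_runner_manager package instead of the charm's top-level src modules. The following is a minimal, self-contained sketch of the new wiring, mirroring the runner_manager fixture in tests/unit/test_runner_scaler.py; the MagicMock cloud stand-in, the example token/prefix/repo values, and the RunnerScaler(runner_manager) constructor call are illustrative assumptions, not code contained in this patch.

    # Hypothetical usage sketch of the externalised package layout. Only the
    # import paths and the RunnerManagerConfig/RunnerManager call signatures
    # are taken from the tests above; everything else is assumed.
    from unittest.mock import MagicMock

    from github_runner_manager.manager.runner_manager import RunnerManager, RunnerManagerConfig
    from github_runner_manager.manager.runner_scaler import RunnerScaler
    from github_runner_manager.types_.github import GitHubRepo

    # RunnerManagerConfig takes the GitHub token and the org/repo path, as in the fixture.
    config = RunnerManagerConfig("example-token", GitHubRepo(owner="example-org", repo="example-repo"))
    cloud = MagicMock()  # stands in for a concrete CloudRunnerManager implementation
    runner_manager = RunnerManager("example-prefix", cloud, config)
    scaler = RunnerScaler(runner_manager)  # assumed constructor, as exercised by the unit tests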