From a6c526a9b7c109d8a581ed059b05411796509d0b Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Wed, 4 Sep 2024 11:12:37 +0200 Subject: [PATCH 01/10] chore(deps): update dependency cryptography to <=43.0.1 (#361) Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9ef53de2b..541c0d4c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ ops>=2.8 pylxd @ git+https://github.com/canonical/pylxd requests typing-extensions -cryptography <=43.0.0 +cryptography <=43.0.1 pydantic ==1.10.17 cosl ==0.0.15 # juju 3.1.2.0 depends on pyyaml<=6.0 and >=5.1.2 From cb1b70e7fccbd3898819c6e1f722dc1880a4e632 Mon Sep 17 00:00:00 2001 From: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> Date: Thu, 5 Sep 2024 12:13:03 +0800 Subject: [PATCH 02/10] fix: add image relation joined framework register (#362) --- src/charm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/charm.py b/src/charm.py index a61586b9b..ee1659933 100755 --- a/src/charm.py +++ b/src/charm.py @@ -246,6 +246,10 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.on[DEBUG_SSH_INTEGRATION_NAME].relation_changed, self._on_debug_ssh_relation_changed, ) + self.framework.observe( + self.on[IMAGE_INTEGRATION_NAME].relation_joined, + self._on_image_relation_joined, + ) self.framework.observe( self.on[IMAGE_INTEGRATION_NAME].relation_changed, self._on_image_relation_changed, @@ -1178,7 +1182,7 @@ def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None: cloud = list(clouds_yaml["clouds"].keys())[0] auth_map = clouds_yaml["clouds"][cloud]["auth"] for relation in self.model.relations[IMAGE_INTEGRATION_NAME]: - relation.data[self.model.unit].update(auth_map) + relation.data[self.unit].update(auth_map) @catch_charm_errors def 
_on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None: From 5b8c99a5355570917c0e4fc7a281af81540c1edc Mon Sep 17 00:00:00 2001 From: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:23:57 +0800 Subject: [PATCH 03/10] Use the refactored runner manager (#351) * Patch runner log path in tests * Add missing fixture * Fix the scope of fixture * Fix tmp_path scope issue * Fix monkeypatch fixture scope issue * Add patch of metric log path * Fix return type of create_runners * Add health check test * Fix arg naming * Add debug statement * Add more debug statement * Move debug statement * Merge tests * Handle openstack errors with delete runner * Fix delete server * Fix test variable reference * Fix OpenstackInstance creation * Add some docstrings * Fix args issues with RunnerInstance * Add more docs * Fix GithubRunnerState construction * Fix instance-id parsing from full name * Add delete idle runner test. * Add busy flush to test * Spawn a manual test env * Disable spawning on manual test env * Remove useless class * Fix runner deletion * Fix import error * Add more docs * Fix get no-existing openstack server * Add debug statement * Fix variable name and function name mixup * Fix id variable name, function name mixup * Add debug statement * Move debug * Add busy runner test * Add debug statement. 
* Disable some test * Disable some test * Fix runner label in workflow * Fix lambda * Debug * Debug * Add debug * Start new manual test env * Add none check * Fix missing prefix * Add more logging * Refactor runner manager one runner fixture * Fix error string formatting * Adding the docstring for github_runner_manager * Fix test fixture scope * Add docstring on cloud_runner_manager * Add debug * Fix docstring for cloud runner manager * Add more docstrings * Add metrics for deleted and cleanup runners * Enable tests again * Add debug * Get runner info not on GitHub * Fix dict access * Add debug of userdata * Fix metric path * Debug metric * Fix variable naming * Test * Fix iterator * Debug * Debug * Fix for iterator return value * Add more log path patching * Fix path naming * Fix monkey patch * Start an arm64 manual test env * Not spawning manual test env * Update fmt * Fix metric storage implementation for openstack * Fix metric storage provider usage in openstack runner manager * Debug * Fix iterator * Add debug * Fix None in iterator * Add debug * Trying fix for get runner filter * Add test * Patch the path for logs * Add cleanup test * Debug * Fix github state determining busy runner * Fix wrong naming for method in ABC * Remove debugging * Add more docstrings * Fix runner deletion * Add more docs * Fix typing * Debug * Update SSH health check * Tmp disable a passing test * Add debug * Remove a debug * Fix Cloud runner state init * Change clean up to cleanup * Fix attr naming issue in openstack cloud * Fix reference to non-existing instance_name in openstack cloud * Add metric log processing to test * Enable all tests * Fix health check return value * Fix all flake8 lints * Fix test * Fix all lints * Fix unit test issue due to method sig change * Ignore openstack cloud from coverage due to the test requires private endpoint * Enable all tests * Remove a repeated test * Re-enable test.yaml * Fix integration tests workflow * Add docs on cleanup method of cloud 
runner manager * Add parallel spawning of runners. * Enable dev testing * Fix parallel spawn * Allow openstack server to take a bit of time on deletion * Refactor test detection of no runners * Re-enable the tests * Fix lints * Disable tests again * Disable some test * Add wait until runner is running * Enable openstack runner manager tests * Add debug * Wait for github state * Refactor wait until runner spawn * Add keyfile error * Remove debug statement * Re-enable all tests * Update src/manager/github_runner_manager.py Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> * Update src/openstack_cloud/openstack_cloud.py Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> * Suggestions * Refactor remove openstack server * Test spawning two runners. * Fix test * Fix naming * Fix according to comment * Fix clouds yaml write issue. * Fix format * Add delete runner by amount * Add getting runner health state for metrics * Fix security group ID issues * Fix according to review * Refactor health state for runner * Fix lint issues * Add missing docs * Update the github state enum to use auto * Rename class to fit convention * Fix according to review * Fix name_prefix property cloud runner manager * Add class for scaling runners * Fix lints * Fix unit test * Fix according to review comment * Fix test according to comments * Fix unit test * Fix typo of attr * Add debug * Add debug statement * Debug * Fix return code of the kill command * Remove debug * Add comments on the flush kill command * Add debug * Fix debug * Debug * Debug * Remove debug * Add cleanup during idle and busy runner test * Debug * Disable tests during debug * Debug missing keyfiles * Fix keyfile path matching issue * testing * debug * Add debug * Use OR * debug * Debug * Debug * Debug * Debug * Debug * Fix flush mode * Remove debug * Re-enable all tests * Initial unit test for runner scaler * Add more unit tests for runner scaler * Add more tests * Fix merge issues * 
Fix states in get_runners methods * Add docstring for unit test mocks * Fix construction of repo-policy-compliance from config * Fix get_runners action output * Fix the lints * Fix a naming issue * Fix naming prefix of runner * Improve unit test * Remove the old OpenstackRunnerManager * Fix test construction of runner manager. * Fix flavor naming * Fix flush action result output. * Fix flavor of metric * Testing out an integration test fix * change flush runner to flush idle. * Add debug in integration test * Manual test mode * Start new manual test env * Spawn x64 manual test env. * Improve logging during reconcile * Fix crashed metric collection * Remove debug workflow * Format * Test * Add reactive back in * Fix flushing of runners * Debug workflow * Add debug * Fix logging of health state * Remove debug * Debug * Fix set construction * Fix SSH key path in integration test setup * Add more checks to repo-policy-compliance setup in tests * Fix key path check * Fix format string issue * Fix format string typo * Add some logging of test setup * Fix missing await * Revert config-change flushing * Add maintenance status for image relation change * Fix HTTP format * Update coverage ignore of github_runner_manager * Minor fix in test comments --------- Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> Co-authored-by: Christopher Bartz --- pyproject.toml | 2 +- src-docs/charm.md | 10 +- src-docs/errors.md | 65 +- src-docs/github_client.md | 12 +- src-docs/openstack_cloud.md | 5 +- src-docs/openstack_cloud.openstack_manager.md | 4 +- ...penstack_cloud.openstack_runner_manager.md | 29 +- src-docs/runner_manager.md | 28 +- src-docs/runner_manager_type.md | 8 +- src-docs/runner_type.md | 2 +- src/charm.py | 193 +- src/charm_state.py | 16 +- src/errors.py | 4 + src/github_client.py | 34 +- src/manager/cloud_runner_manager.py | 15 +- src/manager/github_runner_manager.py | 32 +- src/manager/runner_manager.py | 71 +- src/manager/runner_scaler.py | 215 +++ 
src/metrics/github.py | 4 +- src/openstack_cloud/openstack_manager.py | 1598 ----------------- .../openstack_runner_manager.py | 55 +- src/runner.py | 4 +- src/runner_manager.py | 37 +- src/runner_manager_type.py | 10 +- src/runner_type.py | 4 +- tests/integration/helpers/common.py | 8 +- tests/integration/helpers/openstack.py | 61 +- .../test_charm_scheduled_events.py | 4 +- .../test_runner_manager_openstack.py | 27 +- tests/integration/test_self_hosted_runner.py | 4 +- tests/unit/conftest.py | 10 +- tests/unit/mock_runner_managers.py | 294 +++ tests/unit/test_charm.py | 60 +- tests/unit/test_charm_state.py | 16 +- tests/unit/test_github_client.py | 12 +- ..._manager.py => test_lxd_runner_manager.py} | 56 +- tests/unit/test_openstack_manager.py | 1200 ------------- tests/unit/test_runner.py | 6 +- tests/unit/test_runner_scaler.py | 266 +++ 39 files changed, 1292 insertions(+), 3189 deletions(-) create mode 100644 src/manager/runner_scaler.py delete mode 100644 src/openstack_cloud/openstack_manager.py create mode 100644 tests/unit/mock_runner_managers.py rename tests/unit/{test_runner_manager.py => test_lxd_runner_manager.py} (91%) delete mode 100644 tests/unit/test_openstack_manager.py create mode 100644 tests/unit/test_runner_scaler.py diff --git a/pyproject.toml b/pyproject.toml index f4a49bd2a..d16bac3a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ omit = [ ] [tool.coverage.report] -fail_under = 83 +fail_under = 85 show_missing = true [tool.pytest.ini_options] diff --git a/src-docs/charm.md b/src-docs/charm.md index c03d1411b..9fd2aac04 100644 --- a/src-docs/charm.md +++ b/src-docs/charm.md @@ -20,7 +20,7 @@ Charm for creating and managing GitHub self-hosted runner instances. --- - + ## function `catch_charm_errors` @@ -46,7 +46,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -72,7 +72,7 @@ Catch common errors in actions. 
--- - + ## class `ReconcileRunnersEvent` Event representing a periodic check to ensure runners are ok. @@ -83,7 +83,7 @@ Event representing a periodic check to ensure runners are ok. --- - + ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. @@ -100,7 +100,7 @@ Charm for managing GitHub self-hosted runners. - `ram_pool_path`: The path to memdisk storage. - `kernel_module_path`: The path to kernel modules. - + ### method `__init__` diff --git a/src-docs/errors.md b/src-docs/errors.md index cf7cde565..ee5db5a11 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -99,6 +99,17 @@ Error for setting up aproxy. +## class `MissingServerConfigError` +Error for unable to create runner due to missing server configurations. + + + + + +--- + + + ## class `MissingRunnerBinaryError` Error for missing runner binary. @@ -108,7 +119,7 @@ Error for missing runner binary. --- - + ## class `ConfigurationError` Error for juju configuration. @@ -119,7 +130,7 @@ Error for juju configuration. --- - + ## class `MissingMongoDBError` Error for missing integration data. @@ -130,7 +141,7 @@ Error for missing integration data. --- - + ## class `LxdError` Error for executing LXD actions. @@ -141,7 +152,7 @@ Error for executing LXD actions. --- - + ## class `SubprocessError` Error for Subprocess calls. @@ -155,7 +166,7 @@ Error for Subprocess calls. - `stdout`: Content of stdout of the subprocess. - `stderr`: Content of stderr of the subprocess. - + ### method `__init__` @@ -185,7 +196,7 @@ Construct the subprocess error. --- - + ## class `IssueMetricEventError` Represents an error when issuing a metric event. @@ -196,7 +207,7 @@ Represents an error when issuing a metric event. --- - + ## class `LogrotateSetupError` Represents an error raised when logrotate cannot be setup. @@ -207,7 +218,7 @@ Represents an error raised when logrotate cannot be setup. --- - + ## class `MetricsStorageError` Base class for all metrics storage errors. 
@@ -218,7 +229,7 @@ Base class for all metrics storage errors. --- - + ## class `SharedFilesystemError` Base class for all shared filesystem errors. @@ -229,7 +240,7 @@ Base class for all shared filesystem errors. --- - + ## class `CreateMetricsStorageError` Represents an error when the metrics storage could not be created. @@ -240,7 +251,7 @@ Represents an error when the metrics storage could not be created. --- - + ## class `DeleteMetricsStorageError` Represents an error when the metrics storage could not be deleted. @@ -251,7 +262,7 @@ Represents an error when the metrics storage could not be deleted. --- - + ## class `GetMetricsStorageError` Represents an error when the metrics storage could not be retrieved. @@ -262,7 +273,7 @@ Represents an error when the metrics storage could not be retrieved. --- - + ## class `QuarantineMetricsStorageError` Represents an error when the metrics storage could not be quarantined. @@ -273,7 +284,7 @@ Represents an error when the metrics storage could not be quarantined. --- - + ## class `SharedFilesystemMountError` Represents an error related to the mounting of the shared filesystem. @@ -284,7 +295,7 @@ Represents an error related to the mounting of the shared filesystem. --- - + ## class `RunnerMetricsError` Base class for all runner metrics errors. @@ -295,7 +306,7 @@ Base class for all runner metrics errors. --- - + ## class `CorruptMetricDataError` Represents an error with the data being corrupt. @@ -306,7 +317,7 @@ Represents an error with the data being corrupt. --- - + ## class `GithubMetricsError` Base class for all github metrics errors. @@ -317,7 +328,7 @@ Base class for all github metrics errors. --- - + ## class `GithubClientError` Base class for all github client errors. @@ -328,7 +339,7 @@ Base class for all github client errors. --- - + ## class `GithubApiError` Represents an error when the GitHub API returns an error. @@ -339,7 +350,7 @@ Represents an error when the GitHub API returns an error. 
--- - + ## class `TokenError` Represents an error when the token is invalid or has not enough permissions. @@ -350,7 +361,7 @@ Represents an error when the token is invalid or has not enough permissions. --- - + ## class `JobNotFoundError` Represents an error when the job could not be found on GitHub. @@ -361,7 +372,7 @@ Represents an error when the job could not be found on GitHub. --- - + ## class `RunnerLogsError` Base class for all runner logs errors. @@ -372,7 +383,7 @@ Base class for all runner logs errors. --- - + ## class `OpenStackError` Base class for OpenStack errors. @@ -383,7 +394,7 @@ Base class for OpenStack errors. --- - + ## class `OpenStackInvalidConfigError` Represents an invalid OpenStack configuration. @@ -394,7 +405,7 @@ Represents an invalid OpenStack configuration. --- - + ## class `OpenStackUnauthorizedError` Represents an unauthorized connection to OpenStack. @@ -405,7 +416,7 @@ Represents an unauthorized connection to OpenStack. --- - + ## class `SSHError` Represents an error while interacting with SSH. @@ -416,7 +427,7 @@ Represents an error while interacting with SSH. --- - + ## class `KeyfileError` Represents missing keyfile for SSH. diff --git a/src-docs/github_client.md b/src-docs/github_client.md index 6cd298c52..fc0de8f7b 100644 --- a/src-docs/github_client.md +++ b/src-docs/github_client.md @@ -67,7 +67,7 @@ Instantiate the GiHub API client. ### method `delete_runner` ```python -delete_runner(path: GithubOrg | GithubRepo, runner_id: int) → None +delete_runner(path: GitHubOrg | GitHubRepo, runner_id: int) → None ``` Delete the self-hosted runner from GitHub. @@ -87,7 +87,7 @@ Delete the self-hosted runner from GitHub. ```python get_job_info( - path: GithubRepo, + path: GitHubRepo, workflow_run_id: str, runner_name: str ) → JobStats @@ -123,7 +123,7 @@ Get information about a job for a specific workflow run. 
```python get_runner_application( - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, arch: Arch, os: str = 'linux' ) → RunnerApplication @@ -157,7 +157,7 @@ Get runner application available for download for given arch. ### method `get_runner_github_info` ```python -get_runner_github_info(path: GithubOrg | GithubRepo) → list[SelfHostedRunner] +get_runner_github_info(path: GitHubOrg | GitHubRepo) → list[SelfHostedRunner] ``` Get runner information on GitHub under a repo or org. @@ -180,7 +180,7 @@ Get runner information on GitHub under a repo or org. ### method `get_runner_registration_token` ```python -get_runner_registration_token(path: GithubOrg | GithubRepo) → str +get_runner_registration_token(path: GitHubOrg | GitHubRepo) → str ``` Get token from GitHub used for registering runners. @@ -203,7 +203,7 @@ Get token from GitHub used for registering runners. ### method `get_runner_remove_token` ```python -get_runner_remove_token(path: GithubOrg | GithubRepo) → str +get_runner_remove_token(path: GitHubOrg | GitHubRepo) → str ``` Get token from GitHub used for removing runners. diff --git a/src-docs/openstack_cloud.md b/src-docs/openstack_cloud.md index 4d82f5359..34aa3f26f 100644 --- a/src-docs/openstack_cloud.md +++ b/src-docs/openstack_cloud.md @@ -7,7 +7,10 @@ Module for managing Openstack cloud. **Global Variables** --------------- -- **openstack_manager**: # Copyright 2024 Canonical Ltd. +- **openstack_cloud**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +- **openstack_runner_manager**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. 
diff --git a/src-docs/openstack_cloud.openstack_manager.md b/src-docs/openstack_cloud.openstack_manager.md index a0f0a2531..115eec05b 100644 --- a/src-docs/openstack_cloud.openstack_manager.md +++ b/src-docs/openstack_cloud.openstack_manager.md @@ -27,7 +27,7 @@ create_instance_config( app_name: str, unit_num: int, image_id: str, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, labels: Iterable[str], registration_token: str ) → InstanceConfig @@ -75,7 +75,7 @@ The configuration values for creating a single runner instance. ```python __init__( - github_path: GithubOrg | GithubRepo, + github_path: GitHubOrg | GitHubRepo, image_id: str, labels: Iterable[str], name: str, diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 752e2f9d3..7f28b8689 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,7 +17,7 @@ Manager for self-hosted runner on OpenStack. --- - + ## class `OpenStackCloudConfig` Configuration for OpenStack cloud authorisation information. @@ -47,7 +47,7 @@ __init__(clouds_config: dict[str, dict], cloud: str) → None --- - + ## class `OpenStackServerConfig` Configuration for OpenStack server. @@ -78,9 +78,9 @@ __init__(image: str, flavor: str, network: str) → None --- - + -## class `OpenstackRunnerManager` +## class `OpenStackRunnerManager` Manage self-hosted runner on OpenStack cloud. @@ -89,15 +89,16 @@ Manage self-hosted runner on OpenStack cloud. - `name_prefix`: The name prefix of the runners created. - + ### method `__init__` ```python __init__( + manager_name: str, prefix: str, cloud_config: OpenStackCloudConfig, - server_config: OpenStackServerConfig, + server_config: OpenStackServerConfig | None, runner_config: GitHubRunnerConfig, service_config: SupportServiceConfig ) → None @@ -109,9 +110,10 @@ Construct the object. **Args:** + - `manager_name`: A name to identify this manager. 
- `prefix`: The prefix to runner name. - `cloud_config`: The configuration for OpenStack authorisation. - - `server_config`: The configuration for creating OpenStack server. + - `server_config`: The configuration for creating OpenStack server. Unable to create runner if None. - `runner_config`: The configuration for the runner. - `service_config`: The configuration of supporting services of the runners. @@ -131,7 +133,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -154,7 +156,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -174,6 +176,7 @@ Create a self-hosted runner. **Raises:** + - `MissingServerConfigError`: Unable to create runner due to missing configuration. - `RunnerCreateError`: Unable to create runner due to OpenStack issues. @@ -183,7 +186,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -207,7 +210,7 @@ Delete self-hosted runners. --- - + ### method `flush_runners` @@ -230,7 +233,7 @@ Remove idle and/or busy runners. --- - + ### method `get_runner` @@ -253,7 +256,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src-docs/runner_manager.md b/src-docs/runner_manager.md index 8d1773bf0..f52829efa 100644 --- a/src-docs/runner_manager.md +++ b/src-docs/runner_manager.md @@ -13,9 +13,9 @@ Runner Manager manages the runners on LXD and GitHub. --- - + -## class `RunnerManager` +## class `LXDRunnerManager` Manage a group of runners according to configuration. @@ -25,7 +25,7 @@ Manage a group of runners according to configuration. - `runner_bin_path`: The github runner app scripts path. - `cron_path`: The path to runner build image cron job. - + ### method `__init__` @@ -33,7 +33,7 @@ Manage a group of runners according to configuration. 
__init__( app_name: str, unit: int, - runner_manager_config: RunnerManagerConfig + runner_manager_config: LXDRunnerManagerConfig ) → None ``` @@ -52,7 +52,7 @@ Construct RunnerManager object for creating and managing runners. --- - + ### method `build_runner_image` @@ -72,7 +72,7 @@ Build container image in test mode, else virtual machine image. --- - + ### method `check_runner_bin` @@ -89,12 +89,12 @@ Check if runner binary exists. --- - + ### method `flush` ```python -flush(mode: FlushMode = ) → int +flush(mode: LXDFlushMode = ) → int ``` Remove existing runners. @@ -118,7 +118,7 @@ Remove existing runners. --- - + ### method `get_github_info` @@ -135,7 +135,7 @@ Get information on the runners from GitHub. --- - + ### method `get_latest_runner_bin_url` @@ -166,7 +166,7 @@ The runner binary URL changes when a new version is available. --- - + ### method `has_runner_image` @@ -183,7 +183,7 @@ Check if the runner image exists. --- - + ### method `reconcile` @@ -207,7 +207,7 @@ Bring runners in line with target. --- - + ### method `schedule_build_runner_image` @@ -219,7 +219,7 @@ Install cron job for building runner image. --- - + ### method `update_runner_bin` diff --git a/src-docs/runner_manager_type.md b/src-docs/runner_manager_type.md index f6dd4faae..cd7eaf5d2 100644 --- a/src-docs/runner_manager_type.md +++ b/src-docs/runner_manager_type.md @@ -11,7 +11,7 @@ Types used by RunnerManager class. -## class `FlushMode` +## class `LXDFlushMode` Strategy for flushing runners. During pre-job (repo-check), the runners are marked as idle and if the pre-job fails, the runner falls back to being idle again. Hence wait_repo_check is required. @@ -71,7 +71,7 @@ __init__( -## class `RunnerManagerConfig` +## class `LXDRunnerManagerConfig` Configuration of runner manager. 
@@ -97,7 +97,7 @@ __init__( charm_state: CharmState, image: str, lxd_storage_path: Path, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, service_token: str, token: str, dockerhub_mirror: str | None = None, @@ -147,7 +147,7 @@ Configuration of runner manager. ```python __init__( charm_state: CharmState, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, labels: Iterable[str], token: str, flavor: str, diff --git a/src-docs/runner_type.md b/src-docs/runner_type.md index 8c9db658a..d5029f4f8 100644 --- a/src-docs/runner_type.md +++ b/src-docs/runner_type.md @@ -106,7 +106,7 @@ __init__( labels: tuple[str], lxd_storage_path: Path, name: str, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, proxies: ProxySetting, dockerhub_mirror: str | None = None, ssh_debug_connections: list[SSHDebugConnection] | None = None diff --git a/src/charm.py b/src/charm.py index ee1659933..c60c62bea 100755 --- a/src/charm.py +++ b/src/charm.py @@ -8,6 +8,9 @@ """Charm for creating and managing GitHub self-hosted runner instances.""" +from manager.cloud_runner_manager import GitHubRunnerConfig, SupportServiceConfig +from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig +from manager.runner_scaler import RunnerScaler from utilities import bytes_with_unit_to_kib, execute_command, remove_residual_venv_dirs, retry # This is a workaround for https://bugs.launchpad.net/juju/+bug/2058335 @@ -56,7 +59,7 @@ TOKEN_CONFIG_NAME, CharmConfigInvalidError, CharmState, - GithubPath, + GitHubPath, InstanceType, OpenstackImage, ProxyConfig, @@ -78,10 +81,14 @@ from event_timer import EventTimer, TimerStatusError from firewall import Firewall, FirewallEntry from github_type import GitHubRunnerStatus -from openstack_cloud.openstack_manager import OpenstackRunnerManager +from openstack_cloud.openstack_runner_manager import ( + OpenStackCloudConfig, + OpenStackRunnerManager, + OpenStackServerConfig, +) from runner import LXD_PROFILE_YAML -from 
runner_manager import RunnerManager, RunnerManagerConfig -from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig +from runner_manager import LXDRunnerManager, LXDRunnerManagerConfig +from runner_manager_type import LXDFlushMode RECONCILE_RUNNERS_EVENT = "reconcile-runners" @@ -369,8 +376,8 @@ def _ensure_service_health(self) -> None: raise def _get_runner_manager( - self, state: CharmState, token: str | None = None, path: GithubPath | None = None - ) -> RunnerManager: + self, state: CharmState, token: str | None = None, path: GitHubPath | None = None + ) -> LXDRunnerManager: """Get a RunnerManager instance. Args: @@ -403,10 +410,10 @@ def _get_runner_manager( app_name, unit = self.unit.name.rsplit("/", 1) - return RunnerManager( + return LXDRunnerManager( app_name, unit, - RunnerManagerConfig( + LXDRunnerManagerConfig( charm_state=state, dockerhub_mirror=state.charm_config.dockerhub_mirror, image=state.runner_config.base_image.value, @@ -497,10 +504,11 @@ def _on_start(self, _: StartEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: + self.unit.status = MaintenanceStatus("Starting runners") if not self._get_set_image_ready_status(): return - openstack_runner_manager = self._get_openstack_runner_manager(state) - openstack_runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -512,7 +520,7 @@ def _on_start(self, _: StartEvent) -> None: self.unit.status = MaintenanceStatus("Starting runners") try: - runner_manager.flush(FlushMode.FLUSH_IDLE) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, @@ -578,7 +586,7 @@ def _on_upgrade_charm(self, _: UpgradeCharmEvent) -> None: runner_manager = self._get_runner_manager(state) logger.info("Flushing the runners...") - 
runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, @@ -614,7 +622,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 if prev_runner_manager: self.unit.status = MaintenanceStatus("Removing runners due to config change") # Flush runner in case the prev token has expired. - prev_runner_manager.flush(FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) + prev_runner_manager.flush(LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) state = self._setup_state() @@ -622,9 +630,9 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 if not self._get_set_image_ready_status(): return if state.charm_config.token != self._stored.token: - openstack_runner_manager = self._get_openstack_runner_manager(state) - openstack_runner_manager.flush() - openstack_runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + runner_scaler.reconcile(state.runner_config.virtual_machines) # TODO: 2024-04-12: Flush on token changes. 
self.unit.status = ActiveStatus() return @@ -633,7 +641,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 runner_manager = self._get_runner_manager(state) if state.charm_config.token != self._stored.token: - runner_manager.flush(FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) self._stored.token = state.charm_config.token self._reconcile_runners( runner_manager, @@ -643,7 +651,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 self.unit.status = ActiveStatus() def _check_and_update_local_lxd_dependencies( - self, runner_manager: RunnerManager, token: str, proxy_config: ProxyConfig + self, runner_manager: LXDRunnerManager, token: str, proxy_config: ProxyConfig ) -> bool: """Check and update runner binary and services for local LXD runners. @@ -694,7 +702,7 @@ def _check_and_update_local_lxd_dependencies( runner_bin_updated, ) self.unit.status = MaintenanceStatus("Flushing runners due to updated deps") - runner_manager.flush(FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE_WAIT_REPO_CHECK) self._start_services(token, proxy_config) self.unit.status = ActiveStatus() @@ -723,8 +731,8 @@ def _trigger_reconciliation(self) -> None: if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -761,21 +769,16 @@ def _on_check_runners_action(self, event: ActionEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: - openstack_runner_manager = self._get_openstack_runner_manager(state) - runner_info = openstack_runner_manager.get_github_runner_info() - - for info in runner_info: 
- if info.online: - online += 1 - runner_names.append(info.runner_name) - else: - offline += 1 + runner_scaler = self._get_runner_scaler(state) + info = runner_scaler.get_runner_info() event.set_results( { - "online": online, - "offline": offline, - "unknown": unknown, - "runners": ", ".join(runner_names), + "online": info.online, + "busy": info.busy, + "offline": info.offline, + "unknown": info.unknown, + "runners": info.runners, + "busy-runners": info.busy_runners, } ) return @@ -818,9 +821,9 @@ def _on_reconcile_runners_action(self, event: ActionEvent) -> None: if not self._get_set_image_ready_status(): event.fail("Openstack image not yet provided/ready.") return - runner_manager = self._get_openstack_runner_manager(state) + runner_scaler = self._get_runner_scaler(state) - delta = runner_manager.reconcile(state.runner_config.virtual_machines) + delta = runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() event.set_results({"delta": {"virtual-machines": delta}}) return @@ -851,22 +854,22 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: # Flushing mode not implemented for OpenStack yet. 
- runner_manager = self._get_openstack_runner_manager(state) - flushed = runner_manager.flush() - event.set_results({"delta": {"virtual-machines": flushed}}) + runner_scaler = self._get_runner_scaler(state) + flushed = runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + logger.info("Flushed %s runners", flushed) + delta = runner_scaler.reconcile(state.runner_config.virtual_machines) + event.set_results({"delta": {"virtual-machines": delta}}) return runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) delta = self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, state.runner_config.virtual_machine_resources, ) - - self._on_check_runners_action(event) - event.set_results(delta) + event.set_results({"delta": {"virtual-machines": delta}}) @catch_action_errors def _on_update_dependencies_action(self, event: ActionEvent) -> None: @@ -899,15 +902,15 @@ def _on_stop(self, _: StopEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: - runner_manager = self._get_openstack_runner_manager(state) - runner_manager.flush() + runner_scaler = self._get_runner_scaler(state) + runner_scaler.flush() return runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_BUSY) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY) def _reconcile_runners( - self, runner_manager: RunnerManager, num: int, resources: VirtualMachineResources + self, runner_manager: LXDRunnerManager, num: int, resources: VirtualMachineResources ) -> Dict[str, Any]: """Reconcile the current runners state and intended runner state. @@ -922,7 +925,7 @@ def _reconcile_runners( Returns: Changes in runner number due to reconciling runners. 
""" - if not RunnerManager.runner_bin_path.is_file(): + if not LXDRunnerManager.runner_bin_path.is_file(): logger.warning("Unable to reconcile due to missing runner binary") raise MissingRunnerBinaryError("Runner binary not found.") @@ -1152,15 +1155,15 @@ def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) + runner_scaler = self._get_runner_scaler(state) # TODO: 2024-04-12: Should be flush idle. - runner_manager.flush() - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler.flush() + runner_scaler.reconcile(state.runner_config.virtual_machines) return self._refresh_firewall(state) runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_IDLE) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, @@ -1188,6 +1191,7 @@ def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None: def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None: """Handle image relation changed event.""" state = self._setup_state() + self.unit.status = MaintenanceStatus("Update image for runners") if state.instance_type != InstanceType.OPENSTACK: self.unit.status = BlockedStatus( @@ -1197,10 +1201,9 @@ def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None: if not self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) - # TODO: 2024-04-12: Should be flush idle. 
- runner_manager.flush() - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -1219,13 +1222,10 @@ def _get_set_image_ready_status(self) -> bool: return False return True - def _get_openstack_runner_manager( - self, state: CharmState, token: str | None = None, path: GithubPath | None = None - ) -> OpenstackRunnerManager: - """Get OpenstackRunnerManager instance. - - TODO: 2024-07-09 Combine this with `_get_runner_manager` during the runner manager \ - interface refactor. + def _get_runner_scaler( + self, state: CharmState, token: str | None = None, path: GitHubPath | None = None + ) -> RunnerScaler: + """Get runner scaler instance for scaling runners. Args: state: Charm state. @@ -1235,39 +1235,62 @@ def _get_openstack_runner_manager( name. If None the path in charm state is used. Returns: - An instance of OpenstackRunnerManager. + An instance of RunnerScaler. """ if token is None: token = state.charm_config.token if path is None: path = state.charm_config.path - # Empty image can be passed down due to a delete only case where deletion of runners do not - # depend on the image ID being available. Make sure that the charm goes to blocked status - # in hook where a runner may be created. TODO: 2024-07-09 This logic is subject to - # refactoring. + clouds = list(state.charm_config.openstack_clouds_yaml["clouds"].keys()) + if len(clouds) > 1: + logger.warning( + "Multiple clouds defined in clouds.yaml. Using the first one to connect." 
+ ) + cloud_config = OpenStackCloudConfig( + clouds_config=state.charm_config.openstack_clouds_yaml, + cloud=clouds[0], + ) + server_config = None + image_labels = [] image = state.runner_config.openstack_image - image_id = image.id if image and image.id else "" - image_labels = image.tags if image and image.tags else [] + if image and image.id: + server_config = OpenStackServerConfig( + image=image.id, + flavor=state.runner_config.openstack_flavor, + network=state.runner_config.openstack_network, + ) + if image.tags: + image_labels += image.tags - app_name, unit = self.unit.name.rsplit("/", 1) - openstack_runner_manager_config = OpenstackRunnerManagerConfig( - charm_state=state, - path=path, - token=token, - labels=(*state.charm_config.labels, *image_labels), - flavor=state.runner_config.openstack_flavor, - image=image_id, - network=state.runner_config.openstack_network, + runner_config = GitHubRunnerConfig( + github_path=path, labels=(*state.charm_config.labels, *image_labels) + ) + service_config = SupportServiceConfig( + proxy_config=state.proxy_config, dockerhub_mirror=state.charm_config.dockerhub_mirror, - reactive_config=state.reactive_config, + ssh_debug_connections=state.ssh_debug_connections, + repo_policy_compliance=state.charm_config.repo_policy_compliance, ) - return OpenstackRunnerManager( - app_name, - unit, - openstack_runner_manager_config, - state.charm_config.openstack_clouds_yaml, + # The prefix is set to f"{application_name}-{unit number}" + openstack_runner_manager = OpenStackRunnerManager( + manager_name=self.app.name, + prefix=self.unit.name.replace("/", "-"), + cloud_config=cloud_config, + server_config=server_config, + runner_config=runner_config, + service_config=service_config, + ) + runner_manager_config = RunnerManagerConfig( + token=token, + path=path, + ) + runner_manager = RunnerManager( + manager_name=self.app.name, + cloud_runner_manager=openstack_runner_manager, + config=runner_manager_config, ) + return 
RunnerScaler(runner_manager=runner_manager, reactive_config=state.reactive_config) if __name__ == "__main__": diff --git a/src/charm_state.py b/src/charm_state.py index 492f4b21e..dcd87d122 100644 --- a/src/charm_state.py +++ b/src/charm_state.py @@ -88,7 +88,7 @@ class AnyHttpsUrl(AnyHttpUrl): @dataclasses.dataclass -class GithubRepo: +class GitHubRepo: """Represent GitHub repository. Attributes: @@ -109,7 +109,7 @@ def path(self) -> str: @dataclasses.dataclass -class GithubOrg: +class GitHubOrg: """Represent GitHub organization. Attributes: @@ -129,10 +129,10 @@ def path(self) -> str: return self.org -GithubPath = GithubOrg | GithubRepo +GitHubPath = GitHubOrg | GitHubRepo -def parse_github_path(path_str: str, runner_group: str) -> GithubPath: +def parse_github_path(path_str: str, runner_group: str) -> GitHubPath: """Parse GitHub path. Args: @@ -152,8 +152,8 @@ def parse_github_path(path_str: str, runner_group: str) -> GithubPath: if len(paths) != 2: raise CharmConfigInvalidError(f"Invalid path configuration {path_str}") owner, repo = paths - return GithubRepo(owner=owner, repo=repo) - return GithubOrg(org=path_str, group=runner_group) + return GitHubRepo(owner=owner, repo=repo) + return GitHubOrg(org=path_str, group=runner_group) @dataclasses.dataclass @@ -166,7 +166,7 @@ class GithubConfig: """ token: str - path: GithubPath + path: GitHubPath @classmethod def from_charm(cls, charm: CharmBase) -> "GithubConfig": @@ -410,7 +410,7 @@ class CharmConfig(BaseModel): dockerhub_mirror: AnyHttpsUrl | None labels: tuple[str, ...] 
openstack_clouds_yaml: OpenStackCloudsYAML | None - path: GithubPath + path: GitHubPath reconcile_interval: int repo_policy_compliance: RepoPolicyComplianceConfig | None token: str diff --git a/src/errors.py b/src/errors.py index 59d28a239..4285dc6e4 100644 --- a/src/errors.py +++ b/src/errors.py @@ -39,6 +39,10 @@ class RunnerAproxyError(RunnerError): """Error for setting up aproxy.""" +class MissingServerConfigError(RunnerError): + """Error for unable to create runner due to missing server configurations.""" + + class MissingRunnerBinaryError(Exception): """Error for missing runner binary.""" diff --git a/src/github_client.py b/src/github_client.py index 3c7718f94..b724b5cdb 100644 --- a/src/github_client.py +++ b/src/github_client.py @@ -16,7 +16,7 @@ from ghapi.page import paged from typing_extensions import assert_never -from charm_state import Arch, GithubOrg, GithubPath, GithubRepo +from charm_state import Arch, GitHubOrg, GitHubPath, GitHubRepo from errors import GithubApiError, JobNotFoundError, RunnerBinaryError, TokenError from github_type import ( JobStats, @@ -88,7 +88,7 @@ def __init__(self, token: str): @catch_http_errors def get_runner_application( - self, path: GithubPath, arch: Arch, os: str = "linux" + self, path: GitHubPath, arch: Arch, os: str = "linux" ) -> RunnerApplication: """Get runner application available for download for given arch. @@ -106,11 +106,11 @@ def get_runner_application( The runner application. 
""" runner_applications: RunnerApplicationList = [] - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): runner_applications = self._client.actions.list_runner_applications_for_repo( owner=path.owner, repo=path.repo ) - if isinstance(path, GithubOrg): + if isinstance(path, GitHubOrg): runner_applications = self._client.actions.list_runner_applications_for_org( org=path.org ) @@ -127,7 +127,7 @@ def get_runner_application( ) from err @catch_http_errors - def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: + def get_runner_github_info(self, path: GitHubPath) -> list[SelfHostedRunner]: """Get runner information on GitHub under a repo or org. Args: @@ -139,7 +139,7 @@ def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: """ remote_runners_list: list[SelfHostedRunner] = [] - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): # The documentation of ghapi for pagination is incorrect and examples will give errors. # This workaround is a temp solution. Will be moving to PyGitHub in the future. self._client.actions.list_self_hosted_runners_for_repo( @@ -157,7 +157,7 @@ def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: ) for item in page["runners"] ] - if isinstance(path, GithubOrg): + if isinstance(path, GitHubOrg): # The documentation of ghapi for pagination is incorrect and examples will give errors. # This workaround is a temp solution. Will be moving to PyGitHub in the future. self._client.actions.list_self_hosted_runners_for_org(org=path.org, per_page=100) @@ -175,7 +175,7 @@ def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: return remote_runners_list @catch_http_errors - def get_runner_remove_token(self, path: GithubPath) -> str: + def get_runner_remove_token(self, path: GitHubPath) -> str: """Get token from GitHub used for removing runners. 
Args: @@ -185,11 +185,11 @@ def get_runner_remove_token(self, path: GithubPath) -> str: The removing token. """ token: RemoveToken - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): token = self._client.actions.create_remove_token_for_repo( owner=path.owner, repo=path.repo ) - elif isinstance(path, GithubOrg): + elif isinstance(path, GitHubOrg): token = self._client.actions.create_remove_token_for_org(org=path.org) else: assert_never(token) @@ -197,7 +197,7 @@ def get_runner_remove_token(self, path: GithubPath) -> str: return token["token"] @catch_http_errors - def get_runner_registration_token(self, path: GithubPath) -> str: + def get_runner_registration_token(self, path: GitHubPath) -> str: """Get token from GitHub used for registering runners. Args: @@ -208,11 +208,11 @@ def get_runner_registration_token(self, path: GithubPath) -> str: The registration token. """ token: RegistrationToken - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): token = self._client.actions.create_registration_token_for_repo( owner=path.owner, repo=path.repo ) - elif isinstance(path, GithubOrg): + elif isinstance(path, GitHubOrg): token = self._client.actions.create_registration_token_for_org(org=path.org) else: assert_never(token) @@ -220,7 +220,7 @@ def get_runner_registration_token(self, path: GithubPath) -> str: return token["token"] @catch_http_errors - def delete_runner(self, path: GithubPath, runner_id: int) -> None: + def delete_runner(self, path: GitHubPath, runner_id: int) -> None: """Delete the self-hosted runner from GitHub. Args: @@ -228,19 +228,19 @@ def delete_runner(self, path: GithubPath, runner_id: int) -> None: name. runner_id: Id of the runner. 
""" - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): self._client.actions.delete_self_hosted_runner_from_repo( owner=path.owner, repo=path.repo, runner_id=runner_id, ) - if isinstance(path, GithubOrg): + if isinstance(path, GitHubOrg): self._client.actions.delete_self_hosted_runner_from_org( org=path.org, runner_id=runner_id, ) - def get_job_info(self, path: GithubRepo, workflow_run_id: str, runner_name: str) -> JobStats: + def get_job_info(self, path: GitHubRepo, workflow_run_id: str, runner_name: str) -> JobStats: """Get information about a job for a specific workflow run. Args: diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py index 28ed17b20..aff75ed41 100644 --- a/src/manager/cloud_runner_manager.py +++ b/src/manager/cloud_runner_manager.py @@ -9,7 +9,7 @@ from enum import Enum, auto from typing import Iterator, Sequence, Tuple -from charm_state import GithubPath, ProxyConfig, SSHDebugConnection +from charm_state import GitHubPath, ProxyConfig, RepoPolicyComplianceConfig, SSHDebugConnection from metrics.runner import RunnerMetrics logger = logging.getLogger(__name__) @@ -52,8 +52,9 @@ class CloudRunnerState(str, Enum): UNKNOWN = auto() UNEXPECTED = auto() + # Exclude from coverage as not much value for testing this object conversion. @staticmethod - def from_openstack_server_status( + def from_openstack_server_status( # pragma: no cover openstack_server_status: str, ) -> "CloudRunnerState": """Create from openstack server status. @@ -97,7 +98,7 @@ class GitHubRunnerConfig: labels: The labels to add to runners. """ - github_path: GithubPath + github_path: GitHubPath labels: list[str] @@ -109,15 +110,13 @@ class SupportServiceConfig: proxy_config: The proxy configuration. dockerhub_mirror: The dockerhub mirror to use for runners. ssh_debug_connections: The information on the ssh debug services. - repo_policy_url: The URL of the repo policy service. 
- repo_policy_token: The token to access the repo policy service. + repo_policy_compliance: The configuration of the repo policy compliance service. """ proxy_config: ProxyConfig | None dockerhub_mirror: str | None ssh_debug_connections: list[SSHDebugConnection] | None - repo_policy_url: str | None - repo_policy_token: str | None + repo_policy_compliance: RepoPolicyComplianceConfig | None @dataclass @@ -158,7 +157,7 @@ def create_runner(self, registration_token: str) -> InstanceId: """ @abc.abstractmethod - def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance: + def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: """Get a self-hosted runner by instance id. Args: diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py index 0aed972bd..949a1df38 100644 --- a/src/manager/github_runner_manager.py +++ b/src/manager/github_runner_manager.py @@ -4,9 +4,9 @@ """Client for managing self-hosted runner on GitHub side.""" from enum import Enum, auto -from typing import Sequence +from typing import Iterable -from charm_state import GithubPath +from charm_state import GitHubPath from github_client import GithubClient from github_type import GitHubRunnerStatus, SelfHostedRunner @@ -37,18 +37,19 @@ def from_runner(runner: SelfHostedRunner) -> "GitHubRunnerState": """ state = GitHubRunnerState.OFFLINE # A runner that is busy and offline is possible. - if runner.busy: + if runner["busy"]: state = GitHubRunnerState.BUSY - if runner.status == GitHubRunnerStatus.ONLINE: - if not runner.busy: + if runner["status"] == GitHubRunnerStatus.ONLINE: + if not runner["busy"]: state = GitHubRunnerState.IDLE return state -class GithubRunnerManager: +# Thin wrapper around the GitHub Client. Not much value in unit testing. 
+class GitHubRunnerManager: # pragma: no cover """Manage self-hosted runner on GitHub side.""" - def __init__(self, prefix: str, token: str, path: GithubPath): + def __init__(self, prefix: str, token: str, path: GitHubPath): """Construct the object. Args: @@ -61,8 +62,8 @@ def __init__(self, prefix: str, token: str, path: GithubPath): self.github = GithubClient(token) def get_runners( - self, states: Sequence[GitHubRunnerState] | None = None - ) -> tuple[SelfHostedRunner]: + self, states: Iterable[GitHubRunnerState] | None = None + ) -> tuple[SelfHostedRunner, ...]: """Get info on self-hosted runners of certain states. Args: @@ -72,14 +73,19 @@ def get_runners( Information on the runners. """ runner_list = self.github.get_runner_github_info(self._path) + runner_list = [runner for runner in runner_list if runner.name.startswith(self._prefix)] + + if states is None: + return tuple(runner_list) + + state_set = set(states) return tuple( runner for runner in runner_list - if runner.name.startswith(self._prefix) - and GithubRunnerManager._is_runner_in_state(runner, states) + if GitHubRunnerManager._is_runner_in_state(runner, state_set) ) - def delete_runners(self, states: Sequence[GitHubRunnerState] | None = None) -> None: + def delete_runners(self, states: Iterable[GitHubRunnerState] | None = None) -> None: """Delete the self-hosted runners of certain states. Args: @@ -111,7 +117,7 @@ def get_removal_token(self) -> str: @staticmethod def _is_runner_in_state( - runner: SelfHostedRunner, states: Sequence[GitHubRunnerState] | None + runner: SelfHostedRunner, states: set[GitHubRunnerState] | None ) -> bool: """Check that the runner is in one of the states provided. 
diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py index 048b9c628..72ded77fb 100644 --- a/src/manager/runner_manager.py +++ b/src/manager/runner_manager.py @@ -9,7 +9,7 @@ from multiprocessing import Pool from typing import Iterator, Sequence, Type, cast -from charm_state import GithubPath +from charm_state import GitHubPath from errors import GithubMetricsError, RunnerCreateError from github_type import SelfHostedRunner from manager.cloud_runner_manager import ( @@ -19,7 +19,7 @@ HealthState, InstanceId, ) -from manager.github_runner_manager import GithubRunnerManager, GitHubRunnerState +from manager.github_runner_manager import GitHubRunnerManager, GitHubRunnerState from metrics import events as metric_events from metrics import github as github_metrics from metrics import runner as runner_metrics @@ -86,27 +86,35 @@ class RunnerManagerConfig: """ token: str - path: GithubPath + path: GitHubPath class RunnerManager: """Manage the runners. Attributes: + manager_name: A name to identify this manager. name_prefix: The name prefix of the runners. """ - def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManagerConfig): + def __init__( + self, + manager_name: str, + cloud_runner_manager: CloudRunnerManager, + config: RunnerManagerConfig, + ): """Construct the object. Args: + manager_name: A name to identify this manager. cloud_runner_manager: For managing the cloud instance of the runner. config: Configuration of this class. 
""" + self.manager_name = manager_name self._config = config self._cloud = cloud_runner_manager self.name_prefix = self._cloud.name_prefix - self._github = GithubRunnerManager( + self._github = GitHubRunnerManager( prefix=self.name_prefix, token=self._config.token, path=self._config.path ) @@ -125,21 +133,7 @@ def create_runners(self, num: int) -> tuple[InstanceId]: create_runner_args = [ RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num) ] - instance_id_list = [] - with Pool(processes=min(num, 10)) as pool: - jobs = pool.imap_unordered( - func=RunnerManager._create_runner, iterable=create_runner_args - ) - for _ in range(num): - try: - instance_id = next(jobs) - except RunnerCreateError: - logger.exception("Failed to spawn a runner.") - except StopIteration: - break - else: - instance_id_list.append(instance_id) - return tuple(instance_id_list) + return RunnerManager._spawn_runners(create_runner_args) def get_runners( self, @@ -162,7 +156,7 @@ def get_runners( logger.info("Getting runners...") github_infos = self._github.get_runners(github_states) cloud_infos = self._cloud.get_runners(cloud_states) - github_infos_map = {info.name: info for info in github_infos} + github_infos_map = {info["name"]: info for info in github_infos} cloud_infos_map = {info.name: info for info in cloud_infos} logger.info( "Found following runners: %s", cloud_infos_map.keys() | github_infos_map.keys() @@ -254,6 +248,39 @@ def cleanup(self) -> IssuedMetricEventsStats: deleted_runner_metrics = self._cloud.cleanup(remove_token) return self._issue_runner_metrics(metrics=deleted_runner_metrics) + @staticmethod + def _spawn_runners( + create_runner_args: Sequence["RunnerManager._CreateRunnerArgs"], + ) -> tuple[InstanceId, ...]: + """Parallel spawn of runners. + + The length of the create_runner_args is number _create_runner invocation, and therefore the + number of runner spawned. + + Args: + create_runner_args: List of arg for invoking _create_runner method. 
+ + Returns: + A list of instance ID of runner spawned. + """ + num = len(create_runner_args) + + instance_id_list = [] + with Pool(processes=min(num, 10)) as pool: + jobs = pool.imap_unordered( + func=RunnerManager._create_runner, iterable=create_runner_args + ) + for _ in range(num): + try: + instance_id = next(jobs) + except RunnerCreateError: + logger.exception("Failed to spawn a runner.") + except StopIteration: + break + else: + instance_id_list.append(instance_id) + return tuple(instance_id_list) + def _delete_runners( self, runners: Sequence[RunnerInstance], remove_token: str ) -> IssuedMetricEventsStats: @@ -302,7 +329,7 @@ def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetri issued_events = runner_metrics.issue_events( runner_metrics=extracted_metrics, job_metrics=job_metrics, - flavor=self.name_prefix, + flavor=self.manager_name, ) for event_type in issued_events: diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py new file mode 100644 index 000000000..271b92e51 --- /dev/null +++ b/src/manager/runner_scaler.py @@ -0,0 +1,215 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Module for scaling the runners amount.""" + +import logging +import time +from dataclasses import dataclass + +from pydantic import MongoDsn + +import reactive.runner_manager as reactive_runner_manager +from charm_state import ReactiveConfig +from errors import IssueMetricEventError, MissingServerConfigError +from manager.cloud_runner_manager import HealthState +from manager.github_runner_manager import GitHubRunnerState +from manager.runner_manager import FlushMode, RunnerManager +from metrics import events as metric_events + +logger = logging.getLogger(__name__) + + +@dataclass +class RunnerInfo: + """Information on the runners. + + Attributes: + online: The number of runner in online state. + busy: The number of the runner in busy state. + offline: The number of runner in offline state. 
+ unknown: The number of runner in unknown state. + runners: The names of the online runners. + busy_runners: The names of the busy runners. + """ + + online: int + busy: int + offline: int + unknown: int + runners: tuple[str, ...] + busy_runners: tuple[str, ...] + + +class RunnerScaler: + """Manage the reconcile of runners.""" + + def __init__(self, runner_manager: RunnerManager, reactive_config: ReactiveConfig | None): + """Construct the object. + + Args: + runner_manager: The RunnerManager to perform runner reconcile. + reactive_config: Reactive runner configuration. + """ + self._manager = runner_manager + self._reactive_config = reactive_config + + def get_runner_info(self) -> RunnerInfo: + """Get information on the runners. + + Returns: + The information on the runners. + """ + runner_list = self._manager.get_runners() + online = 0 + busy = 0 + offline = 0 + unknown = 0 + online_runners = [] + busy_runners = [] + for runner in runner_list: + match runner.github_state: + case GitHubRunnerState.BUSY: + online += 1 + online_runners.append(runner.name) + busy += 1 + busy_runners.append(runner.name) + case GitHubRunnerState.IDLE: + online += 1 + online_runners.append(runner.name) + case GitHubRunnerState.OFFLINE: + offline += 1 + case _: + unknown += 1 + return RunnerInfo( + online=online, + busy=busy, + offline=offline, + unknown=unknown, + runners=tuple(online_runners), + busy_runners=tuple(busy_runners), + ) + + def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: + """Flush the runners. + + Args: + flush_mode: Determines the types of runner to be flushed. + + Returns: + Number of runners flushed. 
+ """ + metric_stats = self._manager.cleanup() + delete_metric_stats = self._manager.flush_runners(flush_mode=flush_mode) + events = set(delete_metric_stats.keys()) | set(metric_stats.keys()) + metric_stats = { + event_name: delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) + for event_name in events + } + return metric_stats.get(metric_events.RunnerStop, 0) + + def reconcile(self, quantity: int) -> int: + """Reconcile the quantity of runners. + + Args: + quantity: The number of intended runners. + + Returns: + The Change in number of runners. + """ + logger.info("Start reconcile to %s runner", quantity) + + if self._reactive_config is not None: + logger.info("Reactive configuration detected, going into experimental reactive mode.") + return self._reconcile_reactive(quantity, self._reactive_config.mq_uri) + + start_timestamp = time.time() + delete_metric_stats = None + metric_stats = self._manager.cleanup() + runners = self._manager.get_runners() + logger.info("Reconcile runners from %s to %s", len(runners), quantity) + runner_diff = quantity - len(runners) + if runner_diff > 0: + try: + self._manager.create_runners(runner_diff) + except MissingServerConfigError: + logging.exception( + "Unable to spawn runner due to missing server configuration, such as, image." + ) + elif runner_diff < 0: + delete_metric_stats = self._manager.delete_runners(-runner_diff) + else: + logger.info("No changes to the number of runners.") + end_timestamp = time.time() + + # Merge the two metric stats. 
+ if delete_metric_stats is not None: + metric_stats = { + event_name: delete_metric_stats.get(event_name, 0) + + metric_stats.get(event_name, 0) + for event_name in set(delete_metric_stats) | set(metric_stats) + } + + runner_list = self._manager.get_runners() + busy_runners = [ + runner for runner in runner_list if runner.github_state == GitHubRunnerState.BUSY + ] + idle_runners = [ + runner for runner in runner_list if runner.github_state == GitHubRunnerState.IDLE + ] + offline_healthy_runners = [ + runner + for runner in runner_list + if runner.github_state == GitHubRunnerState.OFFLINE + and runner.health == HealthState.HEALTHY + ] + unhealthy_states = set((HealthState.UNHEALTHY, HealthState.UNKNOWN)) + unhealthy_runners = [runner for runner in runner_list if runner.health in unhealthy_states] + logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) + logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) + logger.info( + "Found %s offline runners that are healthy: %s", + len(offline_healthy_runners), + offline_healthy_runners, + ) + logger.info("Found %s unhealthy runners: %s", len(unhealthy_runners), unhealthy_runners) + + try: + available_runners = set(runner.name for runner in idle_runners) | set( + runner.name for runner in offline_healthy_runners + ) + logger.info( + "Current available runners (idle + healthy offline): %s", available_runners + ) + metric_events.issue_event( + metric_events.Reconciliation( + timestamp=time.time(), + flavor=self._manager.manager_name, + crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) + - metric_stats.get(metric_events.RunnerStop, 0), + idle_runners=len(available_runners), + duration=end_timestamp - start_timestamp, + ) + ) + except IssueMetricEventError: + logger.exception("Failed to issue Reconciliation metric") + + return runner_diff + + def _reconcile_reactive(self, quantity: int, mq_uri: MongoDsn) -> int: + """Reconcile runners reactively. 
+ + Args: + quantity: Number of intended runners. + mq_uri: The URI of the MQ to use to spawn runners reactively. + + Returns: + The difference between intended runners and actual runners. In reactive mode + this number is never negative as additional processes should terminate after a timeout. + """ + logger.info("Reactive mode is experimental and not yet fully implemented.") + return reactive_runner_manager.reconcile( + quantity=quantity, + mq_uri=mq_uri, + queue_name=self._manager.manager_name, + ) diff --git a/src/metrics/github.py b/src/metrics/github.py index 354933fea..e40574eb7 100644 --- a/src/metrics/github.py +++ b/src/metrics/github.py @@ -4,7 +4,7 @@ """Functions to calculate metrics from data retrieved from GitHub.""" import logging -from charm_state import GithubRepo +from charm_state import GitHubRepo from errors import GithubMetricsError, JobNotFoundError from github_client import GithubClient from metrics.runner import PreJobMetrics @@ -35,7 +35,7 @@ def job( try: job_info = github_client.get_job_info( - path=GithubRepo(owner=owner, repo=repo), + path=GitHubRepo(owner=owner, repo=repo), workflow_run_id=pre_job_metrics.workflow_run_id, runner_name=runner_name, ) diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py deleted file mode 100644 index 379d2ae4c..000000000 --- a/src/openstack_cloud/openstack_manager.py +++ /dev/null @@ -1,1598 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -# TODO: 2024-04-11 The module contains too many lines which are scheduled for refactoring. -# pylint: disable=too-many-lines - -# TODO: 2024-04-22 The module contains duplicate code which is scheduled for refactoring. 
-# Lines related to issuing metrics are duplicated: -# ==openstack_cloud.openstack_manager:[1320:1337] -# ==runner_manager:[383:413] -# ==openstack_cloud.openstack_manager:[1283:1314] -# ==runner_manager:[339:368] - -# pylint: disable=duplicate-code - -"""Module for handling interactions with OpenStack.""" -import logging -import secrets -import shutil -import time -from contextlib import contextmanager -from dataclasses import dataclass -from datetime import datetime -from multiprocessing import Pool -from pathlib import Path -from typing import Iterable, Iterator, Literal, Optional, cast - -import invoke -import jinja2 -import openstack -import openstack.connection -import openstack.exceptions -import openstack.image.v2.image -import paramiko -from fabric import Connection as SSHConnection -from openstack.compute.v2.server import Server -from openstack.connection import Connection as OpenstackConnection -from openstack.exceptions import SDKException -from openstack.network.v2.security_group import SecurityGroup -from paramiko.ssh_exception import NoValidConnectionsError - -import reactive.runner_manager as reactive_runner_manager -from charm_state import CharmState, GithubOrg, ProxyConfig, SSHDebugConnection -from errors import ( - CreateMetricsStorageError, - GetMetricsStorageError, - GithubApiError, - GithubClientError, - GithubMetricsError, - IssueMetricEventError, - OpenStackError, - RunnerCreateError, - RunnerStartError, -) -from github_client import GithubClient -from github_type import GitHubRunnerStatus, SelfHostedRunner -from metrics import events as metric_events -from metrics import github as github_metrics -from metrics import runner as runner_metrics -from metrics import storage as metrics_storage -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from repo_policy_compliance_client import RepoPolicyComplianceClient -from runner_manager import IssuedMetricEventsStats -from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig 
-from runner_type import GithubPath, RunnerGithubInfo, RunnerNameByHealth -from utilities import retry, set_env_var - -logger = logging.getLogger(__name__) - -# Update the version when the security group rules are not backward compatible. -SECURITY_GROUP_NAME = "github-runner-v1" -BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME = "scripts/build-openstack-image.sh" -_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") -_CONFIG_SCRIPT_PATH = Path("/home/ubuntu/actions-runner/config.sh") - -RUNNER_APPLICATION = Path("/home/ubuntu/actions-runner") -METRICS_EXCHANGE_PATH = Path("/home/ubuntu/metrics-exchange") -PRE_JOB_SCRIPT = RUNNER_APPLICATION / "pre-job.sh" -MAX_METRICS_FILE_SIZE = 1024 - -RUNNER_STARTUP_PROCESS = "/home/ubuntu/actions-runner/run.sh" -RUNNER_LISTENER_PROCESS = "Runner.Listener" -RUNNER_WORKER_PROCESS = "Runner.Worker" -CREATE_SERVER_TIMEOUT = 5 * 60 - - -class _PullFileError(Exception): - """Represents an error while pulling a file from the runner instance.""" - - def __init__(self, reason: str): - """Construct PullFileError object. - - Args: - reason: The reason for the error. - """ - super().__init__(reason) - - -class _SSHError(Exception): - """Represents an error while interacting with SSH.""" - - def __init__(self, reason: str): - """Construct SSHErrors object. - - Args: - reason: The reason for the error. - """ - super().__init__(reason) - - -@dataclass -class InstanceConfig: - """The configuration values for creating a single runner instance. - - Attributes: - github_path: The GitHub repo/org path to register the runner. - image_id: The Openstack image id to use to boot the instance with. - labels: The runner instance labels. - name: Name of the image to launch the GitHub runner instance with. - registration_token: Token for registering the runner on GitHub. 
- """ - - github_path: GithubPath - image_id: str - labels: Iterable[str] - name: str - registration_token: str - - -SupportedCloudImageArch = Literal["amd64", "arm64"] - - -@dataclass -class _CloudInitUserData: - """Dataclass to hold cloud init userdata. - - Attributes: - instance_config: The configuration values for Openstack instance to launch. - runner_env: The contents of .env to source when launching Github runner. - pre_job_contents: The contents of pre-job script to run before starting the job. - proxies: Proxy values to enable on the Github runner. - dockerhub_mirror: URL to dockerhub mirror. - """ - - instance_config: InstanceConfig - runner_env: str - pre_job_contents: str - dockerhub_mirror: Optional[str] = None - proxies: Optional[ProxyConfig] = None - - -@contextmanager -def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.connection.Connection]: - """Create a connection context managed object, to be used within with statements. - - This method should be called with a valid cloud_config. See _validate_cloud_config. - Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. - See charm_state.py _write_openstack_config_to_disk. - - Args: - cloud_config: The configuration in clouds.yaml format to apply. - - Raises: - OpenStackError: if the credentials provided is not authorized. - - Yields: - An openstack.connection.Connection object. - """ - clouds = list(cloud_config["clouds"].keys()) - if len(clouds) > 1: - logger.warning("Multiple clouds defined in clouds.yaml. Using the first one to connect.") - cloud_name = clouds[0] - - # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but - # I could not reproduce it. Therefore, no catch here for such exception. - try: - with openstack.connect(cloud=cloud_name) as conn: - conn.authorize() - yield conn - # pylint thinks this isn't an exception, but does inherit from Exception class. 
- except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause - logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc - - -# Disable too many arguments, as they are needed to create the dataclass. -def create_instance_config( # pylint: disable=too-many-arguments - app_name: str, - unit_num: int, - image_id: str, - path: GithubPath, - labels: Iterable[str], - registration_token: str, -) -> InstanceConfig: - """Create an instance config from charm data. - - Args: - app_name: The juju application name. - unit_num: The juju unit number. - image_id: The openstack image id to create the instance with. - path: Github organisation or repository path. - labels: Addition labels for the runner. - registration_token: The Github runner registration token. See \ - https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository - - Returns: - Instance configuration created. - """ - suffix = secrets.token_hex(12) - return InstanceConfig( - github_path=path, - image_id=image_id, - labels=labels, - name=f"{app_name}-{unit_num}-{suffix}", - registration_token=registration_token, - ) - - -def _generate_runner_env( - templates_env: jinja2.Environment, - dockerhub_mirror: Optional[str] = None, - ssh_debug_connections: list[SSHDebugConnection] | None = None, -) -> str: - """Generate Github runner .env file contents. - - Proxy configuration are handled by aproxy. - - Args: - templates_env: The jinja template environment. - dockerhub_mirror: The url to Dockerhub to reduce rate limiting. - ssh_debug_connections: Tmate SSH debug connection information to load as environment vars. - - Returns: - The .env contents to be loaded by Github runner. 
- """ - return templates_env.get_template("env.j2").render( - pre_job_script=str(PRE_JOB_SCRIPT), - dockerhub_mirror=dockerhub_mirror or "", - ssh_debug_info=(secrets.choice(ssh_debug_connections) if ssh_debug_connections else None), - ) - - -def _generate_cloud_init_userdata( - templates_env: jinja2.Environment, - cloud_init_userdata: _CloudInitUserData, -) -> str: - """Generate cloud init userdata to launch at startup. - - Args: - templates_env: The jinja template environment. - cloud_init_userdata: The dataclass containing the cloud init userdata. - - Returns: - The cloud init userdata script. - """ - runner_group = None - instance_config = cloud_init_userdata.instance_config - proxies = cloud_init_userdata.proxies - - if isinstance(instance_config.github_path, GithubOrg): - runner_group = instance_config.github_path.group - - aproxy_address = proxies.aproxy_address if proxies is not None else None - return templates_env.get_template("openstack-userdata.sh.j2").render( - github_url=f"https://github.com/{instance_config.github_path.path()}", - runner_group=runner_group, - token=instance_config.registration_token, - instance_labels=",".join(instance_config.labels), - instance_name=instance_config.name, - env_contents=cloud_init_userdata.runner_env, - pre_job_contents=cloud_init_userdata.pre_job_contents, - metrics_exchange_path=str(METRICS_EXCHANGE_PATH), - aproxy_address=aproxy_address, - dockerhub_mirror=cloud_init_userdata.dockerhub_mirror, - ) - - -class GithubRunnerRemoveError(Exception): - """Represents an error removing registered runner from Github.""" - - -_INSTANCE_STATUS_SHUTOFF = "SHUTOFF" -_INSTANCE_STATUS_ERROR = "ERROR" -_INSTANCE_STATUS_ACTIVE = "ACTIVE" -_INSTANCE_STATUS_BUILDING = "BUILDING" - - -class OpenstackRunnerManager: - """Runner manager for OpenStack-based instances. - - Attributes: - app_name: The juju application name. - unit_num: The juju unit number. - instance_name: Prefix of the name for the set of runners. 
- """ - - def __init__( - self, - app_name: str, - unit_num: int, - openstack_runner_manager_config: OpenstackRunnerManagerConfig, - cloud_config: dict[str, dict], - ): - """Construct OpenstackRunnerManager object. - - Args: - app_name: The juju application name. - unit_num: The juju unit number. - openstack_runner_manager_config: Configurations related to runner manager. - cloud_config: The openstack clouds.yaml in dict format. - """ - # Setting the env var to this process and any child process spawned. - proxies = openstack_runner_manager_config.charm_state.proxy_config - if no_proxy := proxies.no_proxy: - set_env_var("NO_PROXY", no_proxy) - if http_proxy := proxies.http: - set_env_var("HTTP_PROXY", http_proxy) - if https_proxy := proxies.https: - set_env_var("HTTPS_PROXY", https_proxy) - - self.app_name = app_name - self.unit_num = unit_num - self.instance_name = f"{app_name}-{unit_num}" - self._config = openstack_runner_manager_config - self._cloud_config = cloud_config - self._github = GithubClient(token=self._config.token) - - def reconcile(self, quantity: int) -> int: - """Reconcile the quantity of runners. - - Args: - quantity: The number of intended runners. - - Returns: - The change in number of runners. - """ - if self._config.reactive_config: - logger.info("Reactive configuration detected, going into experimental reactive mode.") - return self._reconcile_reactive(quantity) - - start_ts = time.time() - try: - delta = self._reconcile_runners(quantity) - finally: - end_ts = time.time() - self._issue_reconciliation_metrics( - reconciliation_start_ts=start_ts, reconciliation_end_ts=end_ts - ) - - return delta - - def _reconcile_reactive(self, quantity: int) -> int: - """Reconcile runners reactively. - - Args: - quantity: Number of intended runners. - - Returns: - The difference between intended runners and actual runners. In reactive mode - this number is never negative as additional processes should terminate after a timeout. 
- """ - logger.info("Reactive mode is experimental and not yet fully implemented.") - return reactive_runner_manager.reconcile( - quantity=quantity, mq_uri=self._config.reactive_config.mq_uri, queue_name=self.app_name - ) - - def _reconcile_runners(self, quantity: int) -> int: - """Reconcile the number of runners. - - Args: - quantity: The number of intended runners. - - Returns: - The change in number of runners. - """ - with _create_connection(self._cloud_config) as conn: - runner_by_health = self._get_openstack_runner_status(conn) - logger.info( - "Found %s healthy runner and %s unhealthy runner", - len(runner_by_health.healthy), - len(runner_by_health.unhealthy), - ) - logger.debug("Healthy runner: %s", runner_by_health.healthy) - logger.debug("Unhealthy runner: %s", runner_by_health.unhealthy) - remove_token = self._github.get_runner_remove_token(path=self._config.path) - - self._clean_up_runners( - conn=conn, runner_by_health=runner_by_health, remove_token=remove_token - ) - - delta = self._scale( - quantity=quantity, - conn=conn, - runner_by_health=runner_by_health, - remove_token=remove_token, - ) - return delta - - def get_github_runner_info(self) -> tuple[RunnerGithubInfo, ...]: - """Get information on GitHub for the runners. - - Returns: - Collection of runner GitHub information. - """ - remote_runners_list: list[SelfHostedRunner] = self._github.get_runner_github_info( - self._config.path - ) - logger.debug("List of runners found on GitHub:%s", remote_runners_list) - return tuple( - RunnerGithubInfo( - runner["name"], - runner["id"], - runner["status"] == GitHubRunnerStatus.ONLINE, - runner["busy"], - ) - for runner in remote_runners_list - if runner["name"].startswith(f"{self.instance_name}-") - ) - - def _get_openstack_runner_status(self, conn: OpenstackConnection) -> RunnerNameByHealth: - """Get status on OpenStack of each runner. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - Runner status grouped by health. 
- """ - healthy_runner = [] - unhealthy_runner = [] - openstack_instances = self._get_openstack_instances(conn) - - logger.debug("Found openstack instances: %s", openstack_instances) - - for instance in openstack_instances: - if not OpenstackRunnerManager._health_check(conn=conn, server_name=instance.name): - unhealthy_runner.append(instance.name) - else: - healthy_runner.append(instance.name) - - return RunnerNameByHealth(healthy=tuple(healthy_runner), unhealthy=tuple(unhealthy_runner)) - - def _get_openstack_instances(self, conn: OpenstackConnection) -> list[Server]: - """Get the OpenStack servers managed by this unit. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - List of OpenStack instances. - """ - return [ - instance - for instance in cast(list[Server], conn.list_servers()) - if instance.name.startswith(f"{self.instance_name}-") - ] - - @staticmethod - def _health_check( - conn: OpenstackConnection, - server_name: str, - startup: bool = False, - ) -> bool: - """Health check a server instance. - - A healthy server is defined as: - 1. Openstack instance status is ACTIVE or BUILDING. - 2. Openstack instance status is in BUILDING less than CREATE_SERVER_TIMEOUT seconds. - 3. Runner.Worker exists (running a job). - 4. Runner.Listener exists (waiting for job). - 5. GitHub runner status is Idle or Active. - - An undetermined server is marked as healthy when: - 1. SSH fails - could be a transient network error. - 2. The Runner.* processes do not exist. Mark healthy for now to gather data. This is - subject to change to unhealthy once enough data has been gathered. - - Args: - conn: The Openstack connection instance. - server_name: The name of the OpenStack server to health check. - startup: Check only whether the startup is successful. - - Returns: - Whether the instance is healthy. 
- """ - server: Server | None = conn.get_server(name_or_id=server_name) - if not server: - return False - if server.status == (_INSTANCE_STATUS_SHUTOFF, _INSTANCE_STATUS_ERROR): - return False - if server.status not in (_INSTANCE_STATUS_ACTIVE, _INSTANCE_STATUS_BUILDING): - return False - created_at = datetime.strptime(server.created_at, "%Y-%m-%dT%H:%M:%SZ") - current_time = datetime.now(created_at.tzinfo) - elapsed_min = (created_at - current_time).total_seconds() - if server.status == _INSTANCE_STATUS_BUILDING: - return elapsed_min < CREATE_SERVER_TIMEOUT - try: - return OpenstackRunnerManager._ssh_health_check( - conn=conn, server_name=server_name, startup=startup - ) - except _SSHError: - logger.warning("Health check failed, unable to SSH into server: %s", server_name) - return False - - @staticmethod - @retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger) - def _ssh_health_check(conn: OpenstackConnection, server_name: str, startup: bool) -> bool: - """Use SSH to check whether runner application is running. - - A healthy runner is defined as: - 1. SSH connection can be established. - 2. Runner.Worker exists (running a job). - 3. Runner.Listener exists (waiting for job). - - Args: - conn: The Openstack connection instance. - server_name: The openstack server instance to check connections. - startup: Check only whether the startup is successful. - - Raises: - _SSHError: if there was an error SSH-ing into the machine or with the SSH command. - - Returns: - Whether the runner application is running. 
- """ - try: - ssh_conn = OpenstackRunnerManager._get_ssh_connection( - conn=conn, server_name=server_name - ) - except _SSHError as exc: - logger.error("[ALERT]: Unable to SSH to server: %s, reason: %s", server_name, str(exc)) - raise - - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - logger.debug("Output of `ps aux` on %s stderr: %s", server_name, result.stderr) - if not result.ok: - logger.warning("List all process command failed on %s.", server_name) - raise _SSHError(f"List process command failed on {server_name}.") - if RUNNER_STARTUP_PROCESS not in result.stdout: - logger.warning("No startup process found on server %s.", server_name) - raise _SSHError(f"Runner not yet started on {server_name}.") - - logger.info("Runner process found to be healthy on %s", server_name) - if startup: - return True - - if RUNNER_WORKER_PROCESS in result.stdout or RUNNER_LISTENER_PROCESS in result.stdout: - return True - - return False - - @staticmethod - @retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger) - def _get_ssh_connection( - conn: OpenstackConnection, server_name: str, timeout: int = 30 - ) -> SSHConnection: - """Get a valid ssh connection within a network for a given openstack instance. - - The SSH connection will attempt to establish connection until the timeout configured. - - Args: - conn: The Openstack connection instance. - server_name: The Openstack server instance name. - timeout: Timeout in seconds to attempt connection to each available server address. - - Raises: - _SSHError: If there was an error getting a valid SSH connection. - - Returns: - An SSH connection to OpenStack server instance. 
- """ - server: Server | None = conn.get_server(name_or_id=server_name) - if server is None: - raise _SSHError(f"Server gone while trying to get SSH connection: {server_name}.") - if not server.key_name: - raise _SSHError( - f"Unable to create SSH connection, no valid keypair found for {server.name}" - ) - key_path = OpenstackRunnerManager._get_key_path(server.name) - if not key_path.exists(): - raise _SSHError(f"Missing keyfile for server: {server.name}, key path: {key_path}") - network_address_list = server.addresses.values() - if not network_address_list: - raise _SSHError(f"No addresses found for OpenStack server {server.name}") - - server_addresses: list[str] = [ - address["addr"] - for network_addresses in network_address_list - for address in network_addresses - ] - for ip in server_addresses: - try: - connection = SSHConnection( - host=ip, - user="ubuntu", - connect_kwargs={"key_filename": str(key_path)}, - connect_timeout=timeout, - ) - result = connection.run("echo hello world", warn=True, timeout=timeout) - if not result.ok: - logger.warning( - "SSH test connection failed, server: %s, address: %s", server.name, ip - ) - continue - if "hello world" in result.stdout: - return connection - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException): - logger.warning( - "Unable to SSH into %s with address %s", - server.name, - connection.host, - exc_info=True, - ) - continue - raise _SSHError( - f"No connectable SSH addresses found, server: {server.name}, " - f"addresses: {server_addresses}" - ) - - @staticmethod - def _get_key_path(name: str) -> Path: - """Get the filepath for storing private SSH of a runner. - - Args: - name: The name of the runner. - - Returns: - Path to reserved for the key file of the runner. - """ - return _SSH_KEY_PATH / f"runner-{name}.key" - - @dataclass - class _CreateRunnerArgs: - """Arguments for _create_runner method. - - Attributes: - app_name: The juju application name. 
- cloud_config: The clouds.yaml containing the OpenStack credentials. The first cloud - in the file will be used. - config: Configurations related to runner manager. - registration_token: Token for registering the runner on GitHub. - unit_num: The juju unit number. - """ - - app_name: str - cloud_config: dict[str, dict] - config: OpenstackRunnerManagerConfig - registration_token: str - unit_num: int - - @staticmethod - def _create_runner(args: _CreateRunnerArgs) -> None: - """Create a runner on OpenStack cloud. - - Arguments are gathered into a dataclass due to Pool.map needing one argument functions. - - Args: - args: Arguments of the method. - - Raises: - RunnerCreateError: Unable to create the OpenStack runner. - """ - ts_now = time.time() - environment = jinja2.Environment( - loader=jinja2.FileSystemLoader("templates"), autoescape=True - ) - - env_contents = _generate_runner_env( - templates_env=environment, - dockerhub_mirror=args.config.dockerhub_mirror, - ssh_debug_connections=args.config.charm_state.ssh_debug_connections, - ) - - pre_job_contents = OpenstackRunnerManager._render_pre_job_contents( - charm_state=args.config.charm_state, templates_env=environment - ) - - instance_config = create_instance_config( - args.app_name, - args.unit_num, - args.config.image, - args.config.path, - args.config.labels, - args.registration_token, - ) - cloud_user_data = _CloudInitUserData( - instance_config=instance_config, - runner_env=env_contents, - pre_job_contents=pre_job_contents, - dockerhub_mirror=args.config.dockerhub_mirror, - proxies=args.config.charm_state.proxy_config, - ) - cloud_userdata_str = _generate_cloud_init_userdata( - templates_env=environment, - cloud_init_userdata=cloud_user_data, - ) - - with _create_connection(cloud_config=args.cloud_config) as conn: - runner_security_group = OpenstackRunnerManager._ensure_security_group(conn) - OpenstackRunnerManager._setup_runner_keypair(conn, instance_config.name) - - logger.info("Creating runner %s", 
instance_config.name) - try: - instance = conn.create_server( - name=instance_config.name, - image=instance_config.image_id, - key_name=instance_config.name, - flavor=args.config.flavor, - network=args.config.network, - security_groups=[runner_security_group["id"]], - userdata=cloud_userdata_str, - auto_ip=False, - timeout=CREATE_SERVER_TIMEOUT, - wait=True, - ) - except openstack.exceptions.ResourceTimeout as err: - logger.exception("Timeout creating OpenStack runner %s", instance_config.name) - try: - logger.info( - "Attempting to remove OpenStack runner %s that timeout on creation", - instance_config.name, - ) - conn.delete_server(name_or_id=instance_config.name, wait=True) - try: - conn.delete_keypair(instance_config.name) - except openstack.exceptions.SDKException: - logger.exception( - "Unable to delete OpenStack keypair %s", instance_config.name - ) - OpenstackRunnerManager._get_key_path(instance_config.name).unlink( - missing_ok=True - ) - except openstack.exceptions.SDKException: - logger.exception( - "Cleanup of creation failure runner %s has failed", instance_config.name - ) - # Reconcile will attempt to cleanup again prior to spawning new runners. 
- raise RunnerCreateError( - f"Timeout creating OpenStack runner {instance_config.name}" - ) from err - except openstack.exceptions.SDKException as err: - logger.exception("Failed to create OpenStack runner %s", instance_config.name) - raise RunnerCreateError( - f"Failed to create OpenStack runner {instance_config.name}" - ) from err - - logger.info("Waiting runner %s to come online", instance_config.name) - OpenstackRunnerManager._wait_until_runner_process_running(conn, instance.name) - logger.info("Finished creating runner %s", instance_config.name) - ts_after = time.time() - OpenstackRunnerManager._issue_runner_installed_metric( - app_name=args.app_name, - instance_config=instance_config, - install_end_ts=ts_after, - install_start_ts=ts_now, - ) - - @staticmethod - def _render_pre_job_contents( - charm_state: CharmState, templates_env: jinja2.Environment - ) -> str: - """Render the pre-job script contents. - - Args: - charm_state: The charm state object. - templates_env: The jinja template environment. - - Returns: - The rendered pre-job script contents. - """ - pre_job_contents_dict = { - "issue_metrics": True, - "metrics_exchange_path": str(METRICS_EXCHANGE_PATH), - "do_repo_policy_check": False, - } - if repo_policy_config := charm_state.charm_config.repo_policy_compliance: - repo_policy_client = RepoPolicyComplianceClient( - url=repo_policy_config.url, charm_token=repo_policy_config.token - ) - pre_job_contents_dict.update( - { - "repo_policy_base_url": repo_policy_client.base_url, - "repo_policy_one_time_token": repo_policy_client.get_one_time_token(), - "do_repo_policy_check": True, - } - ) - pre_job_contents = templates_env.get_template("pre-job.j2").render(pre_job_contents_dict) - return pre_job_contents - - @staticmethod - def _ensure_security_group(conn: OpenstackConnection) -> SecurityGroup: - """Ensure runner security group exists. - - Args: - conn: The connection object to access OpenStack cloud. 
- - Returns: - The security group with the rules for runners. - """ - rule_exists_icmp = False - rule_exists_ssh = False - rule_exists_tmate_ssh = False - - security_group_list = conn.list_security_groups(filters={"name": SECURITY_GROUP_NAME}) - # Pick the first security_group returned. - security_group = next(iter(security_group_list), None) - - if security_group is None: - logger.info("Security group %s not found, creating it", SECURITY_GROUP_NAME) - security_group = conn.create_security_group( - name=SECURITY_GROUP_NAME, - description="For servers managed by the github-runner charm.", - ) - else: - existing_rules = security_group["security_group_rules"] - for rule in existing_rules: - if rule["protocol"] == "icmp": - logger.debug( - "Found ICMP rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_icmp = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 22 - ): - logger.debug( - "Found SSH rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_ssh = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 10022 - ): - logger.debug( - "Found tmate SSH rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_tmate_ssh = True - - if not rule_exists_icmp: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - protocol="icmp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - port_range_min="22", - port_range_max="22", - protocol="tcp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_tmate_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - port_range_min="10022", - port_range_max="10022", - protocol="tcp", - direction="egress", - 
ethertype="IPv4", - ) - return security_group - - @staticmethod - def _setup_runner_keypair(conn: OpenstackConnection, name: str) -> None: - """Set up the SSH keypair for a runner. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the runner. - """ - private_key_path = OpenstackRunnerManager._get_key_path(name) - - if private_key_path.exists(): - logger.warning("Existing private key file for %s found, removing it.", name) - private_key_path.unlink() - - keypair = conn.create_keypair(name=name) - private_key_path.write_text(keypair.private_key) - shutil.chown(private_key_path, user="ubuntu", group="ubuntu") - private_key_path.chmod(0o400) - - @retry(tries=10, delay=60, local_logger=logger) - @staticmethod - def _wait_until_runner_process_running(conn: OpenstackConnection, instance_name: str) -> None: - """Wait until the runner process is running. - - The waiting to done by the retry declarator. - - Args: - conn: The openstack connection instance. - instance_name: The name of the instance to wait on. - - Raises: - RunnerStartError: Unable perform health check of the runner application. - """ - try: - if not OpenstackRunnerManager._health_check( - conn=conn, server_name=instance_name, startup=True - ): - raise RunnerStartError( - ( - "Unable to find running process of runner application on openstack runner " - f"{instance_name}" - ) - ) - except TimeoutError as err: - raise RunnerStartError( - f"Unable to connect to openstack runner {instance_name}" - ) from err - - @staticmethod - def _issue_runner_installed_metric( - app_name: str, - instance_config: InstanceConfig, - install_start_ts: float, - install_end_ts: float, - ) -> None: - """Issue RunnerInstalled metric. - - Args: - app_name: The juju application name. - instance_config: The configuration values for Openstack instance. - install_start_ts: The timestamp when the installation started. - install_end_ts: The timestamp when the installation ended. 
- """ - try: - metric_events.issue_event( - event=metric_events.RunnerInstalled( - timestamp=install_start_ts, - flavor=app_name, - duration=install_end_ts - install_start_ts, - ), - ) - except IssueMetricEventError: - logger.exception("Failed to issue RunnerInstalled metric") - try: - storage = metrics_storage.create(instance_config.name) - except CreateMetricsStorageError: - logger.exception( - "Failed to create metrics storage for runner %s, " - "will not be able to issue all metrics.", - instance_config.name, - ) - else: - try: - (storage.path / RUNNER_INSTALLED_TS_FILE_NAME).write_text( - str(install_end_ts), encoding="utf-8" - ) - except FileNotFoundError: - logger.exception( - "Failed to write runner-installed.timestamp into metrics storage " - "for runner %s, will not be able to issue all metrics.", - instance_config.name, - ) - - def _remove_runners( - self, - conn: OpenstackConnection, - instance_names: Iterable[str], - remove_token: str | None = None, - num_to_remove: int | float | None = None, - ) -> None: - """Delete runners on Openstack. - - Removes the registered runner from Github if remove_token is provided. - - Args: - conn: The Openstack connection instance. - instance_names: The Openstack server names to delete. - remove_token: The GitHub runner remove token. - num_to_remove: Remove a specified number of runners. Remove all if None. - """ - if num_to_remove is None: - num_to_remove = float("inf") - - name_to_github_id = { - runner["name"]: runner["id"] - for runner in self._github.get_runner_github_info(self._config.path) - } - for instance_name in instance_names: - if num_to_remove < 1: - break - - github_id = name_to_github_id.get(instance_name, None) - self._remove_one_runner(conn, instance_name, github_id, remove_token) - - # Attempt to delete the keys. This is place at the end of deletion, so we can access - # the instances that failed to delete on previous tries. 
- try: - conn.delete_keypair(instance_name) - except openstack.exceptions.SDKException: - logger.exception("Unable to delete OpenStack keypair %s", instance_name) - OpenstackRunnerManager._get_key_path(instance_name).unlink(missing_ok=True) - num_to_remove -= 1 - - def _remove_one_runner( - self, - conn: OpenstackConnection, - instance_name: str, - github_id: int | None = None, - remove_token: str | None = None, - ) -> None: - """Remove one OpenStack runner. - - Args: - conn: The Openstack connection instance. - instance_name: The Openstack server name to delete. - github_id: The runner id on GitHub. - remove_token: The GitHub runner remove token. - """ - logger.info("Attempting to remove OpenStack runner %s", instance_name) - - server: Server | None = conn.get_server(name_or_id=instance_name) - - if server is not None: - logger.info( - "Pulling metrics and deleting server for OpenStack runner %s", instance_name - ) - self._pull_metrics(conn=conn, instance_name=instance_name) - self._remove_openstack_runner(conn, server, remove_token) - else: - logger.info( - "Not found server for OpenStack runner %s marked for deletion", instance_name - ) - - if github_id is not None: - try: - self._github.delete_runner(self._config.path, github_id) - except GithubClientError as exc: - logger.warning("Failed to remove runner from Github %s, %s", instance_name, exc) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. - except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - def _pull_metrics(self, conn: OpenstackConnection, instance_name: str) -> None: - """Pull metrics from the runner into the respective storage for the runner. - - Args: - conn: The Openstack connection instance. - instance_name: The Openstack server name. 
- """ - try: - storage = metrics_storage.get(instance_name) - except GetMetricsStorageError: - logger.exception( - "Failed to get shared metrics storage for runner %s, " - "will not be able to issue all metrics.", - instance_name, - ) - return - - try: - ssh_conn = self._get_ssh_connection(conn=conn, server_name=instance_name) - except _SSHError as exc: - logger.info("Failed to pull metrics for %s: %s", instance_name, exc) - return - - try: - self._pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local_path=str(storage.path / "pre-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - self._pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local_path=str(storage.path / "post-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - return - except _PullFileError as exc: - logger.warning( - "Failed to pull metrics for %s: %s . Will not be able to issue all metrics", - instance_name, - exc, - ) - return - - def _pull_file( - self, ssh_conn: SSHConnection, remote_path: str, local_path: str, max_size: int - ) -> None: - """Pull file from the runner instance. - - Args: - ssh_conn: The SSH connection instance. - remote_path: The file path on the runner instance. - local_path: The local path to store the file. - max_size: If the file is larger than this, it will not be pulled. - - Raises: - _PullFileError: Unable to pull the file from the runner instance. - _SSHError: Issue with SSH connection. 
- """ - try: - result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException) as exc: - raise _SSHError(reason=f"Unable to SSH into {ssh_conn.host}") from exc - if not result.ok: - logger.warning( - ( - "Unable to get file size of %s on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - remote_path, - ssh_conn.host, - result.return_code, - result.stdout, - result.stderr, - ) - raise _PullFileError(reason=f"Unable to get file size of {remote_path}") - - stdout = result.stdout - try: - stdout.strip() - size = int(stdout) - if size > max_size: - raise _PullFileError( - reason=f"File size of {remote_path} too large {size} > {max_size}" - ) - except ValueError as exc: - raise _PullFileError(reason=f"Invalid file size for {remote_path}: {stdout}") from exc - - try: - ssh_conn.get(remote=remote_path, local=local_path) - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException) as exc: - raise _SSHError(reason=f"Unable to SSH into {ssh_conn.host}") from exc - except OSError as exc: - raise _PullFileError(reason=f"Unable to retrieve file {remote_path}") from exc - - def _remove_openstack_runner( - self, - conn: OpenstackConnection, - server: Server, - remove_token: str | None = None, - ) -> None: - """Remove a OpenStack server hosting the GitHub runner application. - - Args: - conn: The Openstack connection instance. - server: The Openstack server. - remove_token: The GitHub runner remove token. - """ - try: - self._run_github_removal_script(conn=conn, server=server, remove_token=remove_token) - except (TimeoutError, invoke.exceptions.UnexpectedExit, GithubRunnerRemoveError): - logger.warning( - "Failed to run runner removal script for %s", server.name, exc_info=True - ) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. 
- except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - try: - if not conn.delete_server(name_or_id=server.name, wait=True, delete_ips=True): - logger.warning("Server does not exist %s", server.name) - except SDKException as exc: - logger.error("Something wrong deleting the server %s, %s", server.name, exc) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. - except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - def _run_github_removal_script( - self, conn: OpenstackConnection, server: Server, remove_token: str | None - ) -> None: - """Run Github runner removal script. - - Args: - conn: The Openstack connection instance. - server: The Openstack server instance. - remove_token: The GitHub instance removal token. - - Raises: - GithubRunnerRemoveError: Unable to remove runner from GitHub. - """ - if not remove_token: - return - try: - ssh_conn = OpenstackRunnerManager._get_ssh_connection( - conn=conn, server_name=server.name - ) - except _SSHError as exc: - logger.error( - "Unable to run GitHub removal script, server: %s, reason: %s", - server.name, - str(exc), - ) - raise GithubRunnerRemoveError( - f"Failed to remove runner {server.name} from Github." - ) from exc - - try: - result: invoke.runners.Result = ssh_conn.run( - f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", - warn=True, - ) - if not result.ok: - logger.warning( - ( - "Unable to run removal script on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - server.name, - result.return_code, - result.stdout, - result.stderr, - ) - return - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. 
- except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - logger.warning("Failed to run GitHub runner removal script %s", server.name) - raise GithubRunnerRemoveError(f"Failed to remove runner {server.name} from Github.") - - def _clean_up_keys_files( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all SSH key files except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up SSH key files") - exclude_filename = set( - OpenstackRunnerManager._get_key_path(instance) for instance in exclude_instances - ) - - total = 0 - deleted = 0 - for path in _SSH_KEY_PATH.iterdir(): - # Find key file from this application. - if ( - path.is_file() - and path.name.startswith(self.instance_name) - and path.name.endswith(".key") - ): - total += 1 - if path.name in exclude_filename: - continue - - keypair_name = path.name.split(".")[0] - try: - conn.delete_keypair(keypair_name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - path.name, - ) - - path.unlink() - deleted += 1 - logger.info("Found %s key files, clean up %s key files", total, deleted) - - def _clean_up_openstack_keypairs( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all OpenStack keypairs except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up openstack keypairs") - keypairs = conn.list_keypairs() - for key in keypairs: - # The `name` attribute is of resource.Body type. 
- if key.name and str(key.name).startswith(self.instance_name): - if str(key.name) in exclude_instances: - continue - - try: - conn.delete_keypair(key.name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - key.name, - ) - - def _clean_up_runners( - self, conn: OpenstackConnection, runner_by_health: RunnerNameByHealth, remove_token: str - ) -> None: - """Clean up offline or unhealthy runners. - - Args: - conn: The openstack connection instance. - runner_by_health: The runner status grouped by health. - remove_token: The GitHub runner remove token. - - """ - github_info = self.get_github_runner_info() - online_runners = [runner.runner_name for runner in github_info if runner.online] - offline_runners = [runner.runner_name for runner in github_info if not runner.online] - busy_runners = [runner.runner_name for runner in github_info if runner.busy] - logger.info( - "Found %s online and %s offline openstack runners, %s of the runners are busy", - len(online_runners), - len(offline_runners), - len(busy_runners), - ) - logger.debug("Online runner: %s", online_runners) - logger.debug("Offline runner: %s", offline_runners) - logger.debug("Busy runner: %s", busy_runners) - - healthy_runners_set = set(runner_by_health.healthy) - busy_runners_set = set(busy_runners) - busy_unhealthy_runners = set(runner_by_health.unhealthy).intersection(busy_runners_set) - if busy_unhealthy_runners: - logger.warning("Found unhealthy busy runners %s", busy_unhealthy_runners) - - # Clean up offline (SHUTOFF) runners or unhealthy (no connection/cloud-init script) - # runners. - # Possible for a healthy runner to be appear as offline for sometime as GitHub can be - # slow to update the status. - # For busy runners let GitHub decide whether the runner should be removed. 
- instance_to_remove = tuple( - runner - for runner in (*runner_by_health.unhealthy, *offline_runners) - if runner not in healthy_runners_set and runner not in busy_runners_set - ) - logger.debug("Removing following runners with issues %s", instance_to_remove) - self._remove_runners( - conn=conn, instance_names=instance_to_remove, remove_token=remove_token - ) - # Clean up orphan keys, e.g., If openstack instance is removed externally the key - # would not be deleted. - self._clean_up_keys_files(conn, runner_by_health.healthy) - self._clean_up_openstack_keypairs(conn, runner_by_health.healthy) - - def _scale( - self, - quantity: int, - conn: OpenstackConnection, - runner_by_health: RunnerNameByHealth, - remove_token: str, - ) -> int: - """Scale the number of runners. - - Args: - quantity: The number of intended runners. - conn: The openstack connection instance. - runner_by_health: The runner status grouped by health. - remove_token: The GitHub runner remove token. - - Returns: - The change in number of runners. - """ - # Get the number of OpenStack servers. - # This is not calculated due to there might be removal failures. 
- servers = self._get_openstack_instances(conn) - delta = quantity - len(servers) - registration_token = self._github.get_runner_registration_token(path=self._config.path) - - # Spawn new runners - if delta > 0: - logger.info("Creating %s OpenStack runners", delta) - args = [ - OpenstackRunnerManager._CreateRunnerArgs( - app_name=self.app_name, - config=self._config, - cloud_config=self._cloud_config, - registration_token=registration_token, - unit_num=self.unit_num, - ) - for _ in range(delta) - ] - with Pool(processes=min(delta, 10)) as pool: - pool.map( - func=OpenstackRunnerManager._create_runner, - iterable=args, - ) - - elif delta < 0: - logger.info("Removing %s OpenStack runners", delta) - self._remove_runners( - conn=conn, - instance_names=runner_by_health.healthy, - remove_token=remove_token, - num_to_remove=abs(delta), - ) - else: - logger.info("No changes to number of runners needed") - - return delta - - def _issue_reconciliation_metrics( - self, - reconciliation_start_ts: float, - reconciliation_end_ts: float, - ) -> None: - """Issue all reconciliation related metrics. - - This includes the metrics for the runners and the reconciliation metric itself. - - Args: - reconciliation_start_ts: The timestamp of when reconciliation started. - reconciliation_end_ts: The timestamp of when reconciliation ended. - """ - with _create_connection(self._cloud_config) as conn: - runner_states = self._get_openstack_runner_status(conn) - - metric_stats = self._issue_runner_metrics(conn) - self._issue_reconciliation_metric( - metric_stats=metric_stats, - reconciliation_start_ts=reconciliation_start_ts, - reconciliation_end_ts=reconciliation_end_ts, - runner_states=runner_states, - ) - - def _issue_runner_metrics(self, conn: OpenstackConnection) -> IssuedMetricEventsStats: - """Issue runner metrics. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - The stats of issued metric events. 
- """ - total_stats: IssuedMetricEventsStats = {} - - try: - openstack_instances = self._get_openstack_instances(conn) - except openstack.exceptions.SDKException: - logger.exception( - "Failed to get openstack instances to ignore when extracting metrics." - " Will not issue runner metrics" - ) - return total_stats - - logger.debug( - "Found following openstack instances before extracting metrics: %s", - openstack_instances, - ) - # Don't extract metrics for instances which are still there, as it might be - # the case that the metrics have not yet been pulled - # (they get pulled right before server termination). - instance_names = {instance.name for instance in openstack_instances} - - for extracted_metrics in runner_metrics.extract( - metrics_storage_manager=metrics_storage, - runners=instance_names, - ): - try: - job_metrics = github_metrics.job( - github_client=self._github, - pre_job_metrics=extracted_metrics.pre_job, - runner_name=extracted_metrics.runner_name, - ) - except GithubMetricsError: - logger.exception("Failed to calculate job metrics") - job_metrics = None - - issued_events = runner_metrics.issue_events( - runner_metrics=extracted_metrics, - job_metrics=job_metrics, - flavor=self.app_name, - ) - for event_type in issued_events: - total_stats[event_type] = total_stats.get(event_type, 0) + 1 - return total_stats - - def _issue_reconciliation_metric( - self, - metric_stats: IssuedMetricEventsStats, - reconciliation_start_ts: float, - reconciliation_end_ts: float, - runner_states: RunnerNameByHealth, - ) -> None: - """Issue reconciliation metric. - - Args: - metric_stats: The stats of issued metric events. - reconciliation_start_ts: The timestamp of when reconciliation started. - reconciliation_end_ts: The timestamp of when reconciliation ended. - runner_states: The states of the runners. - """ - try: - github_info = self.get_github_runner_info() - except GithubApiError: - logger.exception( - "Failed to retrieve github info for reconciliation metric. 
" - "Will not issue reconciliation metric." - ) - return - - online_runners = [runner for runner in github_info if runner.online] - offline_runner_names = {runner.runner_name for runner in github_info if not runner.online} - active_runner_names = {runner.runner_name for runner in online_runners if runner.busy} - healthy_runners = set(runner_states.healthy) - - active_count = len(active_runner_names) - idle_online_count = len(online_runners) - active_count - idle_offline_count = len((offline_runner_names & healthy_runners) - active_runner_names) - - try: - metric_events.issue_event( - event=metric_events.Reconciliation( - timestamp=time.time(), - flavor=self.app_name, - crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) - - metric_stats.get(metric_events.RunnerStop, 0), - idle_runners=idle_online_count + idle_offline_count, - duration=reconciliation_end_ts - reconciliation_start_ts, - ) - ) - except IssueMetricEventError: - logger.exception("Failed to issue Reconciliation metric") - - def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: - """Flush Openstack servers. - - 1. Kill the processes depending on flush mode. - 2. Get unhealthy runners after process purging. - 3. Delete unhealthy runners. - - Args: - mode: The mode to determine which runner to flush. - - Returns: - The number of runners flushed. - """ - logger.info("Flushing OpenStack all runners") - with _create_connection(self._cloud_config) as conn: - self._kill_runner_processes(conn=conn, mode=mode) - runner_by_health = self._get_openstack_runner_status(conn) - remove_token = self._github.get_runner_remove_token(path=self._config.path) - self._remove_runners( - conn=conn, - instance_names=runner_by_health.unhealthy, - remove_token=remove_token, - ) - return len(runner_by_health.unhealthy) - - def _kill_runner_processes(self, conn: OpenstackConnection, mode: FlushMode) -> None: - """Kill runner application that are not running any jobs. 
- - Runners that have not picked up a job has - 1. no Runner.Worker process - 2. no pre-run.sh job process - - Args: - conn: The connection object to access OpenStack cloud. - mode: The flush mode to determine which runner processes to kill. - - Raises: - NotImplementedError: If unsupported flush mode has been passed. - """ - killer_command: str - match mode: - case FlushMode.FLUSH_IDLE: - # only kill Runner.Listener if Runner.Worker does not exist. - killer_command = ( - "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " - "kill $(pgrep -x Runner.Listener)" - ) - case FlushMode.FLUSH_BUSY: - # kill both Runner.Listener and Runner.Worker processes. - # This kills pre-job.sh, a child process of Runner.Worker. - killer_command = ( - "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" - "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" - ) - case _: - raise NotImplementedError(f"Unsupported flush mode {mode}") - - servers = self._get_openstack_instances(conn=conn) - for server in servers: - ssh_conn: SSHConnection = self._get_ssh_connection(conn=conn, server_name=server.name) - result: invoke.runners.Result = ssh_conn.run( - killer_command, - warn=True, - ) - if not result.ok: - logger.warning("Failed to kill runner process. Instance: %s", server.name) - continue - logger.info("Successfully killed runner process. 
Instance: %s", server.name) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index 6323b65fa..aa03b0ec3 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -16,12 +16,13 @@ import paramiko.ssh_exception from fabric import Connection as SSHConnection -from charm_state import GithubOrg +from charm_state import GitHubOrg from errors import ( CreateMetricsStorageError, GetMetricsStorageError, IssueMetricEventError, KeyfileError, + MissingServerConfigError, OpenStackError, RunnerCreateError, RunnerStartError, @@ -40,7 +41,6 @@ from metrics import runner as runner_metrics from metrics import storage as metrics_storage from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance -from openstack_cloud.openstack_manager import GithubRunnerRemoveError from repo_policy_compliance_client import RepoPolicyComplianceClient from utilities import retry @@ -60,6 +60,10 @@ CREATE_SERVER_TIMEOUT = 5 * 60 +class _GithubRunnerRemoveError(Exception): + """Represents an error while SSH into a runner and running the remove script.""" + + class _PullFileError(Exception): """Represents an error while pulling a file from the runner instance.""" @@ -105,7 +109,7 @@ class _RunnerHealth: unhealthy: tuple[OpenstackInstance, ...] -class OpenstackRunnerManager(CloudRunnerManager): +class OpenStackRunnerManager(CloudRunnerManager): """Manage self-hosted runner on OpenStack cloud. Attributes: @@ -115,21 +119,25 @@ class OpenstackRunnerManager(CloudRunnerManager): # Ignore "Too many arguments", as the class requires a lot of configurations. def __init__( # pylint: disable=R0913 self, + manager_name: str, prefix: str, cloud_config: OpenStackCloudConfig, - server_config: OpenStackServerConfig, + server_config: OpenStackServerConfig | None, runner_config: GitHubRunnerConfig, service_config: SupportServiceConfig, ) -> None: """Construct the object. 
Args: + manager_name: A name to identify this manager. prefix: The prefix to runner name. cloud_config: The configuration for OpenStack authorisation. - server_config: The configuration for creating OpenStack server. + server_config: The configuration for creating OpenStack server. Unable to create + runner if None. runner_config: The configuration for the runner. service_config: The configuration of supporting services of the runners. """ + self._manager_name = manager_name self._prefix = prefix self._cloud_config = cloud_config self._server_config = server_config @@ -157,13 +165,17 @@ def create_runner(self, registration_token: str) -> InstanceId: registration_token: The GitHub registration token for registering runners. Raises: + MissingServerConfigError: Unable to create runner due to missing configuration. RunnerCreateError: Unable to create runner due to OpenStack issues. Returns: Instance ID of the runner. """ + if self._server_config is None: + raise MissingServerConfigError("Missing server configuration to create runners") + start_timestamp = time.time() - instance_id = OpenstackRunnerManager._generate_instance_id() + instance_id = OpenStackRunnerManager._generate_instance_id() instance_name = self._openstack_cloud.get_server_name(instance_id=instance_id) cloud_init = self._generate_cloud_init( instance_name=instance_name, registration_token=registration_token @@ -183,9 +195,9 @@ def create_runner(self, registration_token: str) -> InstanceId: self._wait_runner_running(instance) end_timestamp = time.time() - OpenstackRunnerManager._issue_runner_installed_metric( + OpenStackRunnerManager._issue_runner_installed_metric( name=instance_name, - flavor=self.name_prefix, + flavor=self._manager_name, install_start_timestamp=start_timestamp, install_end_timestamp=end_timestamp, ) @@ -241,7 +253,9 @@ def get_runners( ] if states is None: return tuple(instance_list) - return tuple(instance for instance in instance_list if instance.state in states) + + state_set = 
set(states) + return tuple(instance for instance in instance_list if instance.state in state_set) def delete_runner( self, instance_id: InstanceId, remove_token: str @@ -326,10 +340,10 @@ def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None self._pull_runner_metrics(instance.server_name, ssh_conn) try: - OpenstackRunnerManager._run_runner_removal_script( + OpenStackRunnerManager._run_runner_removal_script( instance.server_name, ssh_conn, remove_token ) - except GithubRunnerRemoveError: + except _GithubRunnerRemoveError: logger.warning( "Unable to run github runner removal script for %s", instance.server_name, @@ -426,7 +440,7 @@ def _generate_cloud_init(self, instance_name: str, registration_token: str) -> s pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) runner_group = None - if isinstance(self._runner_config.github_path, GithubOrg): + if isinstance(self._runner_config.github_path, GitHubOrg): runner_group = self._runner_config.github_path.group aproxy_address = ( self._service_config.proxy_config.aproxy_address @@ -452,9 +466,10 @@ def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | Non Returns: The repo policy compliance client. 
""" - if self._service_config.repo_policy_url and self._service_config.repo_policy_token: + if self._service_config.repo_policy_compliance is not None: return RepoPolicyComplianceClient( - self._service_config.repo_policy_url, self._service_config.repo_policy_token + self._service_config.repo_policy_compliance.url, + self._service_config.repo_policy_compliance.token, ) return None @@ -535,7 +550,7 @@ def _health_check(self, instance: OpenstackInstance) -> bool: "SSH connection failure with %s during health check", instance.server_name ) raise - return OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) + return OpenStackRunnerManager._run_health_check(ssh_conn, instance.server_name) @staticmethod def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: @@ -686,13 +701,13 @@ def _pull_runner_metrics(name: str, ssh_conn: SSHConnection) -> None: return try: - OpenstackRunnerManager._ssh_pull_file( + OpenStackRunnerManager._ssh_pull_file( ssh_conn=ssh_conn, remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), local_path=str(storage.path / "pre-job-metrics.json"), max_size=MAX_METRICS_FILE_SIZE, ) - OpenstackRunnerManager._ssh_pull_file( + OpenStackRunnerManager._ssh_pull_file( ssh_conn=ssh_conn, remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), local_path=str(storage.path / "post-job-metrics.json"), @@ -775,7 +790,7 @@ def _run_runner_removal_script( remove_token: The GitHub instance removal token. Raises: - GithubRunnerRemoveError: Unable to remove runner from GitHub. + _GithubRunnerRemoveError: Unable to remove runner from GitHub. 
""" try: result = ssh_conn.run( @@ -795,12 +810,12 @@ def _run_runner_removal_script( result.stdout, result.stderr, ) - raise GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") + raise _GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") except ( TimeoutError, paramiko.ssh_exception.NoValidConnectionsError, paramiko.ssh_exception.SSHException, ) as exc: - raise GithubRunnerRemoveError( + raise _GithubRunnerRemoveError( f"Failed to remove runner {instance_name} from Github." ) from exc diff --git a/src/runner.py b/src/runner.py index 4610faded..61a12115c 100644 --- a/src/runner.py +++ b/src/runner.py @@ -23,7 +23,7 @@ import yaml import shared_fs -from charm_state import Arch, GithubOrg, SSHDebugConnection, VirtualMachineResources +from charm_state import Arch, GitHubOrg, SSHDebugConnection, VirtualMachineResources from errors import ( CreateMetricsStorageError, GithubClientError, @@ -838,7 +838,7 @@ def _register_runner(self, registration_token: str, labels: Sequence[str]) -> No self.instance.name, ] - if isinstance(self.config.path, GithubOrg): + if isinstance(self.config.path, GitHubOrg): register_cmd += ["--runnergroup", self.config.path.group] logger.info("Executing registration command...") diff --git a/src/runner_manager.py b/src/runner_manager.py index 8d68a68c9..31c30ef85 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -41,7 +41,12 @@ from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME from repo_policy_compliance_client import RepoPolicyComplianceClient from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus -from runner_manager_type import FlushMode, RunnerInfo, RunnerManagerClients, RunnerManagerConfig +from runner_manager_type import ( + LXDFlushMode, + LXDRunnerManagerConfig, + RunnerInfo, + RunnerManagerClients, +) from runner_type import ProxySetting as RunnerProxySetting from runner_type import RunnerNameByHealth from utilities import 
execute_command, retry, set_env_var @@ -56,7 +61,7 @@ IssuedMetricEventsStats = dict[Type[metric_events.Event], int] -class RunnerManager: +class LXDRunnerManager: """Manage a group of runners according to configuration. Attributes: @@ -71,7 +76,7 @@ def __init__( self, app_name: str, unit: int, - runner_manager_config: RunnerManagerConfig, + runner_manager_config: LXDRunnerManagerConfig, ) -> None: """Construct RunnerManager object for creating and managing runners. @@ -159,7 +164,7 @@ def update_runner_bin(self, binary: RunnerApplication) -> None: try: # Delete old version of runner binary. - RunnerManager.runner_bin_path.unlink(missing_ok=True) + LXDRunnerManager.runner_bin_path.unlink(missing_ok=True) except OSError as err: logger.exception("Unable to perform file operation on the runner binary path") raise RunnerBinaryError("File operation failed on the runner binary path") from err @@ -182,7 +187,7 @@ def update_runner_bin(self, binary: RunnerApplication) -> None: sha256 = hashlib.sha256() - with RunnerManager.runner_bin_path.open(mode="wb") as file: + with LXDRunnerManager.runner_bin_path.open(mode="wb") as file: # Process with chunk_size of 128 KiB. 
for chunk in response.iter_content(chunk_size=128 * 1024, decode_unicode=False): file.write(chunk) @@ -267,7 +272,7 @@ def _create_runner( config=CreateRunnerConfig( image=self.config.image, resources=resources, - binary_path=RunnerManager.runner_bin_path, + binary_path=LXDRunnerManager.runner_bin_path, registration_token=registration_token, arch=self.config.charm_state.arch, ) @@ -309,7 +314,7 @@ def _create_runner( config=CreateRunnerConfig( image=self.config.image, resources=resources, - binary_path=RunnerManager.runner_bin_path, + binary_path=LXDRunnerManager.runner_bin_path, registration_token=registration_token, arch=self.config.charm_state.arch, ) @@ -447,7 +452,7 @@ def _spawn_new_runners(self, count: int, resources: VirtualMachineResources) -> Raises: RunnerCreateError: If there was an error spawning new runner. """ - if not RunnerManager.runner_bin_path.exists(): + if not LXDRunnerManager.runner_bin_path.exists(): raise RunnerCreateError("Unable to create runner due to missing runner binary.") logger.info("Getting registration token for GitHub runners.") registration_token = self._clients.github.get_runner_registration_token(self.config.path) @@ -619,7 +624,7 @@ def _runners_in_pre_job(self) -> bool: return False return True - def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: + def flush(self, mode: LXDFlushMode = LXDFlushMode.FLUSH_IDLE) -> int: """Remove existing runners. 
Args: @@ -636,7 +641,7 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: remove_token = self._clients.github.get_runner_remove_token(self.config.path) except GithubClientError: logger.exception("Failed to get remove-token to unregister runners from GitHub.") - if mode != FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK: + if mode != LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK: raise logger.info("Proceeding with flush without remove-token.") remove_token = None @@ -656,9 +661,9 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: logger.info(REMOVED_RUNNER_LOG_STR, runner.config.name) if mode in ( - FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK, - FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, - FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_IDLE_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, + LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, ): for _ in range(5): if not self._runners_in_pre_job(): @@ -673,9 +678,9 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: ) if mode in ( - FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, - FlushMode.FLUSH_BUSY, - FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY, + LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, ): busy_runners = [runner for runner in self._get_runners() if runner.status.exist] diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py index f3a2112f5..95f8edcc3 100644 --- a/src/runner_manager_type.py +++ b/src/runner_manager_type.py @@ -10,14 +10,14 @@ import jinja2 -from charm_state import CharmState, GithubPath, ReactiveConfig +from charm_state import CharmState, GitHubPath, ReactiveConfig from github_client import GithubClient from github_type import GitHubRunnerStatus from lxd import LxdClient from repo_policy_compliance_client import RepoPolicyComplianceClient -class FlushMode(Enum): +class LXDFlushMode(Enum): """Strategy for flushing runners. 
During pre-job (repo-check), the runners are marked as idle and if the pre-job fails, the @@ -61,7 +61,7 @@ class RunnerManagerClients: @dataclass # The instance attributes are all required. -class RunnerManagerConfig: # pylint: disable=too-many-instance-attributes +class LXDRunnerManagerConfig: # pylint: disable=too-many-instance-attributes """Configuration of runner manager. Attributes: @@ -81,7 +81,7 @@ class RunnerManagerConfig: # pylint: disable=too-many-instance-attributes charm_state: CharmState image: str lxd_storage_path: Path - path: GithubPath + path: GitHubPath service_token: str token: str dockerhub_mirror: str | None = None @@ -113,7 +113,7 @@ class OpenstackRunnerManagerConfig: # pylint: disable=too-many-instance-attribu """ charm_state: CharmState - path: GithubPath + path: GitHubPath labels: Iterable[str] token: str flavor: str diff --git a/src/runner_type.py b/src/runner_type.py index 86769eafd..92560cbcf 100644 --- a/src/runner_type.py +++ b/src/runner_type.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Optional -from charm_state import GithubPath, SSHDebugConnection +from charm_state import GitHubPath, SSHDebugConnection @dataclass @@ -64,7 +64,7 @@ class RunnerConfig: # pylint: disable=too-many-instance-attributes labels: tuple[str] lxd_storage_path: Path name: str - path: GithubPath + path: GitHubPath proxies: ProxySetting dockerhub_mirror: str | None = None ssh_debug_connections: list[SSHDebugConnection] | None = None diff --git a/tests/integration/helpers/common.py b/tests/integration/helpers/common.py index 16622c038..495c952b3 100644 --- a/tests/integration/helpers/common.py +++ b/tests/integration/helpers/common.py @@ -36,7 +36,7 @@ TOKEN_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME, ) -from runner_manager import RunnerManager +from runner_manager import LXDRunnerManager from tests.status_name import ACTIVE DISPATCH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_test.yaml" @@ -93,7 +93,7 @@ async def 
check_runner_binary_exists(unit: Unit) -> bool: Returns: Whether the runner binary file exists in the charm. """ - return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {LXDRunnerManager.runner_bin_path}") return return_code == 0 @@ -141,10 +141,10 @@ async def remove_runner_bin(unit: Unit) -> None: Args: unit: Unit instance to check for the LXD profile. """ - await run_in_unit(unit, f"rm {RunnerManager.runner_bin_path}") + await run_in_unit(unit, f"rm {LXDRunnerManager.runner_bin_path}") # No file should exists under with the filename. - return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {LXDRunnerManager.runner_bin_path}") assert return_code != 0 diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index b2d7624a6..c15afd5a5 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -2,6 +2,7 @@ # See LICENSE file for licensing details. import logging import secrets +from asyncio import sleep from typing import Optional, TypedDict, cast import openstack.connection @@ -10,6 +11,7 @@ from openstack.compute.v2.server import Server from charm_state import VIRTUAL_MACHINES_CONFIG_NAME +from openstack_cloud.openstack_cloud import OpenstackCloud from tests.integration.helpers.common import InstanceHelper, reconcile, run_in_unit, wait_for logger = logging.getLogger(__name__) @@ -40,8 +42,9 @@ async def expose_to_instance( unit: The juju unit of the github-runner charm. port: The port on the juju machine to expose to the runner. 
""" - runner = self._get_runner(unit=unit) + runner = self._get_single_runner(unit=unit) assert runner, f"Runner not found for unit {unit.name}" + logger.info("[TEST SETUP] Exposing port %s on %s", port, runner.name) network_address_list = runner.addresses.values() logger.warning(network_address_list) assert ( @@ -55,9 +58,24 @@ async def expose_to_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" - ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' + key_path = OpenstackCloud._get_key_path(runner.name) + exit_code, _, _ = await run_in_unit(unit, f"ls {key_path}") + assert exit_code == 0, f"Unable to find key file {key_path}" + ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i {key_path} -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) - assert exit_code == 0, f"Error in SSH remote forwarding of port {port}: {stderr}" + assert ( + exit_code == 0 + ), f"Error in starting background process of SSH remote forwarding of port {port}: {stderr}" + + await sleep(1) + for _ in range(6): + exit_code, _, _ = await self.run_in_instance( + unit=unit, command=f"nc -z localhost {port}" + ) + if exit_code == 0: + return + await sleep(10) + assert False, f"Exposing the port {port} failed" async def run_in_instance( self, @@ -79,8 +97,9 @@ async def run_in_instance( Returns: Tuple of return code, stdout and stderr. 
""" - runner = self._get_runner(unit=unit) + runner = self._get_single_runner(unit=unit) assert runner, f"Runner not found for unit {unit.name}" + logger.info("[TEST SETUP] Run command %s on %s", command, runner.name) network_address_list = runner.addresses.values() logger.warning(network_address_list) assert ( @@ -94,7 +113,10 @@ async def run_in_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" - ssh_cmd = f'ssh -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' + key_path = OpenstackCloud._get_key_path(runner.name) + exit_code, _, _ = await run_in_unit(unit, f"ls {key_path}") + assert exit_code == 0, f"Unable to find key file {key_path}" + ssh_cmd = f'ssh -i {key_path} -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' ssh_cmd_as_ubuntu_user = f"su - ubuntu -c '{ssh_cmd}'" logging.warning("ssh_cmd: %s", ssh_cmd_as_ubuntu_user) exit_code, stdout, stderr = await run_in_unit(unit, ssh_cmd, timeout) @@ -152,12 +174,14 @@ async def _get_runner_names(self, unit: Unit) -> tuple[str, ...]: Returns: Tuple of runner names. """ - runner = self._get_runner(unit) + runner = self._get_single_runner(unit) assert runner, "Failed to find runner server" return (cast(str, runner.name),) - def _get_runner(self, unit: Unit) -> Server | None: - """Get the runner server. + def _get_single_runner(self, unit: Unit) -> Server | None: + """Get the only runner for the unit. + + This method asserts for exactly one runner for the unit. Args: unit: The unit to get the runner for. @@ -166,14 +190,12 @@ def _get_runner(self, unit: Unit) -> Server | None: The runner server. 
""" servers: list[Server] = self.openstack_connection.list_servers() - runner = None unit_name_without_slash = unit.name.replace("/", "-") - for server in servers: - if server.name.startswith(unit_name_without_slash): - runner = server - break - - return runner + runners = [server for server in servers if server.name.startswith(unit_name_without_slash)] + assert ( + len(runners) == 1 + ), f"In {unit.name} found more than one runners or no runners: {runners}" + return runners[0] async def setup_repo_policy( @@ -214,6 +236,13 @@ async def setup_repo_policy( await instance_helper.ensure_charm_has_runner(app=app) await instance_helper.expose_to_instance(unit, 8080) + # This tests the connection to the repo policy compliance, not a health check of service. + await instance_helper.run_in_instance( + unit=unit, + command="curl http://localhost:8080", + assert_on_failure=True, + assert_msg="Unable to reach the repo policy compliance server setup", + ) async def _install_repo_policy( @@ -247,7 +276,7 @@ async def _install_repo_policy( ) await run_in_unit( unit, - f'sudo -u ubuntu HTTPS_PROXY={https_proxy if https_proxy else ""} pip install --proxy http://squid.internal:3128 -r /home/ubuntu/repo_policy_compliance/requirements.txt', + f'sudo -u ubuntu HTTPS_PROXY={https_proxy if https_proxy else ""} pip install {f"--proxy {https_proxy}" if https_proxy else ""} -r /home/ubuntu/repo_policy_compliance/requirements.txt', assert_on_failure=True, assert_msg="Failed to install repo-policy-compliance requirements", ) diff --git a/tests/integration/test_charm_scheduled_events.py b/tests/integration/test_charm_scheduled_events.py index aa4a9f1b3..5e9819f23 100644 --- a/tests/integration/test_charm_scheduled_events.py +++ b/tests/integration/test_charm_scheduled_events.py @@ -13,7 +13,7 @@ from juju.application import Application from juju.model import Model -from runner_manager import RunnerManager +from runner_manager import LXDRunnerManager from tests.integration.helpers.common 
import check_runner_binary_exists from tests.integration.helpers.lxd import get_runner_names, run_in_unit, wait_till_num_of_runners from tests.status_name import ACTIVE @@ -40,7 +40,7 @@ async def test_update_interval(model: Model, app_scheduled_events: Application) unit = app_scheduled_events.units[0] assert await check_runner_binary_exists(unit) - ret_code, stdout, stderr = await run_in_unit(unit, f"rm -f {RunnerManager.runner_bin_path}") + ret_code, stdout, stderr = await run_in_unit(unit, f"rm -f {LXDRunnerManager.runner_bin_path}") assert ret_code == 0, f"Failed to remove runner binary {stdout} {stderr}" assert not await check_runner_binary_exists(unit) diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b20426ca0..63b7204b3 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -17,7 +17,7 @@ from github.Workflow import Workflow from openstack.connection import Connection as OpenstackConnection -from charm_state import GithubPath, ProxyConfig, parse_github_path +from charm_state import GitHubPath, ProxyConfig, parse_github_path from manager.cloud_runner_manager import CloudRunnerState, GitHubRunnerConfig, SupportServiceConfig from manager.github_runner_manager import GitHubRunnerState from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig @@ -25,7 +25,7 @@ from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH from openstack_cloud.openstack_runner_manager import ( OpenStackCloudConfig, - OpenstackRunnerManager, + OpenStackRunnerManager, OpenStackServerConfig, ) from tests.integration.helpers.common import ( @@ -64,7 +64,7 @@ def log_dir_base_path_fixture( @pytest.fixture(scope="module", name="github_path") -def github_path_fixture(path: str) -> GithubPath: +def github_path_fixture(path: str) -> GitHubPath: return parse_github_path(path, "Default") @@ -92,11 +92,11 @@ async def 
openstack_runner_manager_fixture( openstack_test_image: str, flavor_name: str, network_name: str, - github_path: GithubPath, + github_path: GitHubPath, proxy_config: ProxyConfig, runner_label: str, openstack_connection: OpenstackConnection, -) -> OpenstackRunnerManager: +) -> OpenStackRunnerManager: """Create OpenstackRunnerManager instance. The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture @@ -122,19 +122,18 @@ async def openstack_runner_manager_fixture( proxy_config=proxy_config, dockerhub_mirror=None, ssh_debug_connections=None, - repo_policy_url=None, - repo_policy_token=None, + repo_policy_compliance=None, ) - return OpenstackRunnerManager( - app_name, cloud_config, server_config, runner_config, service_config + return OpenStackRunnerManager( + app_name, f"{app_name}-0", cloud_config, server_config, runner_config, service_config ) @pytest_asyncio.fixture(scope="module", name="runner_manager") async def runner_manager_fixture( - openstack_runner_manager: OpenstackRunnerManager, + openstack_runner_manager: OpenStackRunnerManager, token: str, - github_path: GithubPath, + github_path: GitHubPath, log_dir_base_path: dict[str, Path], ) -> RunnerManager: """Get RunnerManager instance. @@ -142,7 +141,7 @@ async def runner_manager_fixture( Import of log_dir_base_path to monkeypatch the runner logs path with tmp_path. 
""" config = RunnerManagerConfig(token, github_path) - return RunnerManager(openstack_runner_manager, config) + return RunnerManager("test_runner", openstack_runner_manager, config) @pytest_asyncio.fixture(scope="function", name="runner_manager_with_one_runner") @@ -219,7 +218,7 @@ async def test_get_no_runner(runner_manager: RunnerManager) -> None: @pytest.mark.asyncio @pytest.mark.abort_on_fail async def test_runner_normal_idle_lifecycle( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager + runner_manager: RunnerManager, openstack_runner_manager: OpenStackRunnerManager ) -> None: """ Arrange: RunnerManager instance with no runners. @@ -397,7 +396,7 @@ async def test_runner_normal_lifecycle( @pytest.mark.asyncio @pytest.mark.abort_on_fail async def test_runner_spawn_two( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager + runner_manager: RunnerManager, openstack_runner_manager: OpenStackRunnerManager ) -> None: """ Arrange: RunnerManager instance with no runners. diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py index c91ac8e97..4232fae4b 100644 --- a/tests/integration/test_self_hosted_runner.py +++ b/tests/integration/test_self_hosted_runner.py @@ -16,7 +16,7 @@ DOCKERHUB_MIRROR_CONFIG_NAME, PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME, - GithubRepo, + GitHubRepo, ) from github_client import GithubClient from tests.integration.helpers.common import ( @@ -150,7 +150,7 @@ async def test_flush_busy_runner( # Wait until runner online and then busy. 
for _ in range(30): all_runners = runner_manager_github_client.get_runner_github_info( - GithubRepo( + GitHubRepo( owner=forked_github_repository.owner.login, repo=forked_github_repository.name ) ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3ee0259f7..cb50275f6 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -10,7 +10,7 @@ import pytest import utilities -from openstack_cloud import openstack_manager +from manager.runner_scaler import RunnerScaler from tests.unit.mock import MockGhapiClient, MockLxdClient, MockRepoPolicyComplianceClient @@ -46,7 +46,7 @@ def disk_usage_mock(total_disk: int): @pytest.fixture(autouse=True) def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_path): - openstack_manager_mock = unittest.mock.MagicMock(spec=openstack_manager) + runner_scaler_mock = unittest.mock.MagicMock(spec=RunnerScaler) cron_path = tmp_path / "cron.d" cron_path.mkdir() @@ -61,7 +61,7 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr( "charm.GithubRunnerCharm.repo_check_systemd_service", tmp_path / "systemd_service" ) - monkeypatch.setattr("charm.OpenstackRunnerManager", openstack_manager_mock) + monkeypatch.setattr("charm.RunnerScaler", runner_scaler_mock) monkeypatch.setattr("charm.GithubRunnerCharm.kernel_module_path", tmp_path / "modules") monkeypatch.setattr("charm.GithubRunnerCharm._update_kernel", lambda self, now: None) monkeypatch.setattr("charm.execute_command", exec_command) @@ -86,8 +86,8 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr("runner_manager.LxdClient", MockLxdClient) monkeypatch.setattr("runner_manager.shared_fs", unittest.mock.MagicMock()) monkeypatch.setattr("runner_manager.execute_command", exec_command) - monkeypatch.setattr("runner_manager.RunnerManager.runner_bin_path", runner_binary_path) - monkeypatch.setattr("runner_manager.RunnerManager.cron_path", cron_path) 
+ monkeypatch.setattr("runner_manager.LXDRunnerManager.runner_bin_path", runner_binary_path) + monkeypatch.setattr("runner_manager.LXDRunnerManager.cron_path", cron_path) monkeypatch.setattr( "runner_manager.RepoPolicyComplianceClient", MockRepoPolicyComplianceClient ) diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py new file mode 100644 index 000000000..443c84dfd --- /dev/null +++ b/tests/unit/mock_runner_managers.py @@ -0,0 +1,294 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +import random +import secrets +from dataclasses import dataclass +from typing import Iterable, Iterator, Sequence +from unittest.mock import MagicMock + +from charm_state import GitHubPath +from github_client import GithubClient +from github_type import GitHubRunnerStatus, SelfHostedRunner +from manager.cloud_runner_manager import ( + CloudRunnerInstance, + CloudRunnerManager, + CloudRunnerState, + InstanceId, +) +from manager.github_runner_manager import GitHubRunnerState +from metrics.runner import RunnerMetrics +from tests.unit.mock import MockGhapiClient + + +@dataclass +class MockRunner: + """Mock of a runner. + + Attributes: + name: The name of the runner. + instance_id: The instance id of the runner. + cloud_state: The cloud state of the runner. + github_state: The github state of the runner. + health: The health state of the runner. + """ + + name: str + instance_id: InstanceId + cloud_state: CloudRunnerState + github_state: GitHubRunnerState + health: bool + + def __init__(self, name: str): + """Construct the object. + + Args: + name: The name of the runner. + """ + self.name = name + self.instance_id = secrets.token_hex(6) + self.cloud_state = CloudRunnerState.ACTIVE + self.github_state = GitHubRunnerState.IDLE + self.health = True + + def to_cloud_runner(self) -> CloudRunnerInstance: + """Construct CloudRunnerInstance from this object. + + Returns: + The CloudRunnerInstance instance. 
+ """ + return CloudRunnerInstance( + name=self.name, + instance_id=self.instance_id, + health=self.health, + state=self.cloud_state, + ) + + +@dataclass +class SharedMockRunnerManagerState: + """State shared by mock runner managers. + + For sharing the mock runner states between MockCloudRunnerManager and MockGitHubRunnerManager. + + Attributes: + runners: The runners. + """ + + runners: dict[InstanceId, MockRunner] + + def __init__(self): + """Construct the object.""" + self.runners = {} + + +class MockCloudRunnerManager(CloudRunnerManager): + """Mock of CloudRunnerManager. + + Metrics is not supported in this mock. + + Attributes: + name_prefix: The naming prefix for runners managed. + prefix: The naming prefix for runners managed. + state: The shared state between mocks runner managers. + """ + + def __init__(self, state: SharedMockRunnerManagerState): + """Construct the object. + + Args: + state: The shared state between cloud and github runner managers. + """ + self.prefix = f"mock_{secrets.token_hex(4)}" + self.state = state + + @property + def name_prefix(self) -> str: + """Get the name prefix of the self-hosted runners.""" + return self.prefix + + def create_runner(self, registration_token: str) -> InstanceId: + """Create a self-hosted runner. + + Args: + registration_token: The GitHub registration token for registering runners. + + Returns: + The instance id of the runner created. + """ + name = f"{self.name_prefix}-{secrets.token_hex(6)}" + runner = MockRunner(name) + self.state.runners[runner.instance_id] = runner + return runner.instance_id + + def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: + """Get a self-hosted runner by instance id. + + Args: + instance_id: The instance id. + + Returns: + The runner instance if found else None. 
+ """ + runner = self.state.runners.get(instance_id, None) + if runner is not None: + return runner.to_cloud_runner() + return None + + def get_runners( + self, states: Sequence[CloudRunnerState] | None = None + ) -> tuple[CloudRunnerInstance, ...]: + """Get self-hosted runners by state. + + Args: + states: Filter for the runners with these github states. If None all states will be + included. + + Returns: + The list of runner instances. + """ + if states is None: + states = [member.value for member in CloudRunnerState] + + state_set = set(states) + return tuple( + runner.to_cloud_runner() + for runner in self.state.runners.values() + if runner.cloud_state in state_set + ) + + def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: + """Delete self-hosted runner. + + Args: + instance_id: The instance id of the runner to delete. + remove_token: The GitHub remove token. + + Returns: + Any runner metrics produced during deletion. + """ + runner = self.state.runners.pop(instance_id, None) + if runner is not None: + return iter([MagicMock()]) + return iter([]) + + def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: + """Stop all runners. + + Args: + remove_token: The GitHub remove token for removing runners. + busy: If false, only idle runners are removed. If true, both idle and busy runners are + removed. + + Returns: + Any runner metrics produced during flushing. + """ + if busy: + self.state.runners = {} + else: + self.state.runners = { + instance_id: runner + for instance_id, runner in self.state.runners.items() + if runner.github_state == GitHubRunnerState.BUSY + } + return iter([MagicMock()]) + + def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: + """Cleanup runner and resource on the cloud. + + Perform health check on runner and delete the runner if it fails. + + Args: + remove_token: The GitHub remove token for removing runners. 
+ + Returns: + Any runner metrics produced during cleanup. + """ + # Do nothing in mocks. + return iter([MagicMock()]) + + +class MockGitHubRunnerManager: + """Mock of GitHubRunnerManager. + + Attributes: + github: The GitHub client. + name_prefix: The naming prefix for runner managed. + state: The shared state between mock runner managers. + path: The GitHub path to register the runners under. + """ + + def __init__(self, name_prefix: str, path: GitHubPath, state: SharedMockRunnerManagerState): + """Construct the object. + + Args: + name_prefix: The naming prefix for runner managed. + path: The GitHub path to register the runners under. + state: The shared state between mock runner managers. + """ + self.github = GithubClient("mock_token") + self.github._client = MockGhapiClient("mock_token") + self.name_prefix = name_prefix + self.state = state + self.path = path + + def get_registration_token(self) -> str: + """Get the registration token for registering runners on GitHub. + + Returns: + The registration token. + """ + return "mock_registration_token" + + def get_removal_token(self) -> str: + """Get the remove token for removing runners on GitHub. + + Returns: + The remove token. + """ + return "mock_remove_token" + + def get_runners( + self, github_states: Iterable[GitHubRunnerState] | None = None + ) -> tuple[SelfHostedRunner, ...]: + """Get the runners. + + Args: + github_states: The states to filter for. + + Returns: + List of runners. 
+ """ + if github_states is None: + github_states = [member.value for member in GitHubRunnerState] + + github_state_set = set(github_states) + return tuple( + SelfHostedRunner( + busy=runner.github_state == GitHubRunnerState.BUSY, + id=random.randint(1, 1000000), + labels=[], + os="linux", + name=runner.name, + status=( + GitHubRunnerStatus.OFFLINE + if runner.github_state == GitHubRunnerState.OFFLINE + else GitHubRunnerStatus.ONLINE + ), + ) + for runner in self.state.runners.values() + if runner.github_state in github_state_set + ) + + def delete_runners(self, states: Iterable[GitHubRunnerState]) -> None: + """Delete the runners. + + Args: + states: The states to filter the runners to delete. + """ + github_states = set(states) + self.state.runners = { + instance_id: runner + for instance_id, runner in self.state.runners.items() + if runner.github_state not in github_states + } diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 8b19a7797..060bbc96f 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -28,8 +28,8 @@ VM_CPU_CONFIG_NAME, VM_DISK_CONFIG_NAME, Arch, - GithubOrg, - GithubRepo, + GitHubOrg, + GitHubRepo, InstanceType, OpenStackCloudsYAML, OpenstackImage, @@ -49,7 +49,7 @@ from event_timer import EventTimer, TimerEnableError from firewall import FirewallEntry from github_type import GitHubRunnerStatus -from runner_manager import RunnerInfo, RunnerManagerConfig +from runner_manager import LXDRunnerManagerConfig, RunnerInfo TEST_PROXY_SERVER_URL = "http://proxy.server:1234" @@ -158,8 +158,10 @@ def stub_update_runner_bin(*args, **kwargs) -> None: harness = Harness(GithubRunnerCharm) harness.update_config({PATH_CONFIG_NAME: "mock/repo", TOKEN_CONFIG_NAME: "mocktoken"}) harness.begin() - monkeypatch.setattr("runner_manager.RunnerManager.update_runner_bin", stub_update_runner_bin) - monkeypatch.setattr("runner_manager.RunnerManager._runners_in_pre_job", lambda self: False) + monkeypatch.setattr( + 
"runner_manager.LXDRunnerManager.update_runner_bin", stub_update_runner_bin + ) + monkeypatch.setattr("runner_manager.LXDRunnerManager._runners_in_pre_job", lambda self: False) monkeypatch.setattr("charm.EventTimer.ensure_event_timer", MagicMock()) monkeypatch.setattr("charm.logrotate.setup", MagicMock()) return harness @@ -208,7 +210,7 @@ def test_common_install_code( monkeypatch.setattr("charm.logrotate.setup", setup_logrotate := MagicMock()) monkeypatch.setattr( - "runner_manager.RunnerManager.schedule_build_runner_image", + "runner_manager.LXDRunnerManager.schedule_build_runner_image", schedule_build_runner_image := MagicMock(), ) event_timer_mock = MagicMock(spec=EventTimer) @@ -243,11 +245,11 @@ def test_common_install_code_does_not_rebuild_image( assert: Image is not rebuilt. """ monkeypatch.setattr( - "runner_manager.RunnerManager.build_runner_image", + "runner_manager.LXDRunnerManager.build_runner_image", build_runner_image := MagicMock(), ) monkeypatch.setattr( - "runner_manager.RunnerManager.has_runner_image", + "runner_manager.LXDRunnerManager.has_runner_image", MagicMock(return_value=True), ) getattr(harness.charm.on, hook).emit() @@ -439,7 +441,7 @@ def test_database_integration_events_trigger_reconciliation( class TestCharm(unittest.TestCase): """Test the GithubRunner charm.""" - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -460,8 +462,8 @@ def test_org_register(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( - path=GithubOrg(org="mockorg", group="mockgroup"), + LXDRunnerManagerConfig( + path=GitHubOrg(org="mockorg", group="mockgroup"), token="mocktoken", image="jammy", service_token=token, @@ -470,7 +472,7 @@ def test_org_register(self, run, wt, mkdir, rm): ), ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") 
@patch("subprocess.run") @@ -490,8 +492,8 @@ def test_repo_register(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( - path=GithubRepo(owner="mockorg", repo="repo"), + LXDRunnerManagerConfig( + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, @@ -500,7 +502,7 @@ def test_repo_register(self, run, wt, mkdir, rm): ), ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -527,7 +529,7 @@ def test_exceed_free_disk_size(self, run, wt, mkdir, rm): ) ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -548,8 +550,8 @@ def test_update_config(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( - path=GithubRepo(owner="mockorg", repo="repo"), + LXDRunnerManagerConfig( + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, @@ -570,8 +572,8 @@ def test_update_config(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( - path=GithubRepo(owner="mockorg", repo="repo"), + LXDRunnerManagerConfig( + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, @@ -584,7 +586,7 @@ def test_update_config(self, run, wt, mkdir, rm): ) mock_rm.reset_mock() - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -629,7 +631,7 @@ def test_on_update_status(self, run, wt, mkdir, rm): with pytest.raises(TimerEnableError): harness.charm.on.update_status.emit() - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ 
-641,7 +643,7 @@ def test_on_stop(self, run, wt, mkdir, rm): harness.charm.on.stop.emit() mock_rm.flush.assert_called() - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -660,8 +662,8 @@ def test_on_start_failure(self, run, wt, mkdir, rm): "Failed to start runners: mock error" ) - @patch("charm.RunnerManager") - @patch("charm.OpenstackRunnerManager") + @patch("charm.LXDRunnerManager") + @patch("charm.RunnerScaler") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -700,7 +702,7 @@ def test_on_config_changed_openstack_clouds_yaml(self, run, wt, mkdir, orm, rm): assert harness.charm.unit.status == BlockedStatus("Please provide image integration.") - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -719,7 +721,7 @@ def test_check_runners_action(self, run, wt, mkdir, rm): {"online": 2, "offline": 2, "unknown": 1, "runners": "test runner 0, test runner 1"} ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -733,7 +735,7 @@ def test_check_runners_action_with_errors(self, run, wt, mkdir, rm): harness.charm._on_check_runners_action(mock_event) mock_event.fail.assert_called_with("Invalid Github config, Missing path configuration") - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -936,7 +938,7 @@ def test__on_image_relation_image_ready(): harness.charm._setup_state = MagicMock(return_value=state_mock) harness.charm._get_set_image_ready_status = MagicMock(return_value=True) runner_manager_mock = MagicMock() - harness.charm._get_openstack_runner_manager = MagicMock(return_value=runner_manager_mock) + 
harness.charm._get_runner_scaler = MagicMock(return_value=runner_manager_mock) harness.charm._on_image_relation_changed(MagicMock()) diff --git a/tests/unit/test_charm_state.py b/tests/unit/test_charm_state.py index 8479782a8..d8fdd896d 100644 --- a/tests/unit/test_charm_state.py +++ b/tests/unit/test_charm_state.py @@ -41,8 +41,8 @@ CharmState, FirewallEntry, GithubConfig, - GithubOrg, - GithubRepo, + GitHubOrg, + GitHubRepo, ImmutableConfigChangedError, LocalLxdRunnerConfig, OpenstackImage, @@ -65,7 +65,7 @@ def test_github_repo_path(): """ owner = "test_owner" repo = "test_repo" - github_repo = GithubRepo(owner, repo) + github_repo = GitHubRepo(owner, repo) path = github_repo.path() @@ -80,7 +80,7 @@ def test_github_org_path(): """ org = "test_org" group = "test_group" - github_org = GithubOrg(org, group) + github_org = GitHubOrg(org, group) path = github_org.path() @@ -129,14 +129,14 @@ def test_github_config_from_charm_invalid_token(): @pytest.mark.parametrize( "path_str, runner_group, expected_type, expected_attrs", [ - ("owner/repo", "test_group", GithubRepo, {"owner": "owner", "repo": "repo"}), - ("test_org", "test_group", GithubOrg, {"org": "test_org", "group": "test_group"}), + ("owner/repo", "test_group", GitHubRepo, {"owner": "owner", "repo": "repo"}), + ("test_org", "test_group", GitHubOrg, {"org": "test_org", "group": "test_group"}), ], ) def test_parse_github_path( path_str: str, runner_group: str, - expected_type: GithubRepo | GithubOrg, + expected_type: GitHubRepo | GitHubOrg, expected_attrs: dict[str, str], ): """ @@ -498,7 +498,7 @@ def test_charm_config_from_charm_valid(): result = CharmConfig.from_charm(mock_charm) - assert result.path == GithubRepo(owner="owner", repo="repo") + assert result.path == GitHubRepo(owner="owner", repo="repo") assert result.reconcile_interval == 5 assert result.denylist == [ FirewallEntry(ip_range="192.168.1.1"), diff --git a/tests/unit/test_github_client.py b/tests/unit/test_github_client.py index 
b01a75a01..9bd336a03 100644 --- a/tests/unit/test_github_client.py +++ b/tests/unit/test_github_client.py @@ -10,7 +10,7 @@ import pytest -from charm_state import GithubRepo +from charm_state import GitHubRepo from errors import JobNotFoundError from github_client import GithubClient from github_type import JobConclusion, JobStats @@ -95,7 +95,7 @@ def test_get_job_info(github_client: GithubClient, job_stats_raw: JobStatsRawDat act: Call get_job_info. assert: The correct JobStats object is returned. """ - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) job_stats = github_client.get_job_info( path=github_repo, workflow_run_id=secrets.token_hex(16), @@ -128,7 +128,7 @@ def test_get_job_info_no_conclusion(github_client: GithubClient, job_stats_raw: } ] } - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) job_stats = github_client.get_job_info( path=github_repo, workflow_run_id=secrets.token_hex(16), @@ -156,7 +156,7 @@ def test_github_api_pagination_multiple_pages( github_client=github_client, job_stats_raw=job_stats_raw, include_runner=True ) - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) job_stats = github_client.get_job_info( path=github_repo, workflow_run_id=secrets.token_hex(16), @@ -184,7 +184,7 @@ def test_github_api_pagination_job_not_found( github_client=github_client, job_stats_raw=job_stats_raw, include_runner=False ) - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) with pytest.raises(JobNotFoundError): github_client.get_job_info( @@ -198,7 +198,7 @@ def 
test_github_api_http_error(github_client: GithubClient, job_stats_raw: JobSt github_client._client.actions.list_jobs_for_workflow_run.side_effect = HTTPError( "http://test.com", 500, "", http.client.HTTPMessage(), None ) - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) + github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) with pytest.raises(JobNotFoundError): github_client.get_job_info( diff --git a/tests/unit/test_runner_manager.py b/tests/unit/test_lxd_runner_manager.py similarity index 91% rename from tests/unit/test_runner_manager.py rename to tests/unit/test_lxd_runner_manager.py index 66b09cd60..36c36df11 100644 --- a/tests/unit/test_runner_manager.py +++ b/tests/unit/test_lxd_runner_manager.py @@ -1,7 +1,7 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. -"""Test cases of RunnerManager class.""" +"""Test cases of LXDRunnerManager class.""" import random import secrets from pathlib import Path @@ -16,8 +16,8 @@ Arch, CharmConfig, CharmState, - GithubOrg, - GithubRepo, + GitHubOrg, + GitHubRepo, ProxyConfig, ReactiveConfig, VirtualMachineResources, @@ -28,7 +28,7 @@ from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME from metrics.storage import MetricsStorage from runner import Runner, RunnerStatus -from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, RunnerManager, RunnerManagerConfig +from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, LXDRunnerManager, LXDRunnerManagerConfig from runner_type import RunnerNameByHealth from tests.unit.mock import TEST_BINARY, MockLxdImageManager @@ -67,9 +67,9 @@ def charm_state_fixture(charm_config: MagicMock): scope="function", name="runner_manager", params=[ - (GithubOrg("test_org", "test_group"), ProxyConfig()), + (GitHubOrg("test_org", "test_group"), ProxyConfig()), ( - GithubRepo("test_owner", "test_repo"), + GitHubRepo("test_owner", "test_repo"), ProxyConfig( no_proxy="test_no_proxy", http=TEST_PROXY_SERVER_URL, @@ 
-82,15 +82,15 @@ def charm_state_fixture(charm_config: MagicMock): def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state): charm_state.proxy_config = request.param[1] monkeypatch.setattr( - "runner_manager.RunnerManager.runner_bin_path", tmp_path / "mock_runner_binary" + "runner_manager.LXDRunnerManager.runner_bin_path", tmp_path / "mock_runner_binary" ) pool_path = tmp_path / "test_storage" pool_path.mkdir(exist_ok=True) - runner_manager = RunnerManager( + runner_manager = LXDRunnerManager( "test app", "0", - RunnerManagerConfig( + LXDRunnerManagerConfig( path=request.param[0], token=token, image=IMAGE_NAME, @@ -144,7 +144,7 @@ def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> Magi pytest.param(Arch.X64), ], ) -def test_get_latest_runner_bin_url(runner_manager: RunnerManager, arch: Arch, charm_state): +def test_get_latest_runner_bin_url(runner_manager: LXDRunnerManager, arch: Arch, charm_state): """ arrange: Nothing. act: Get runner bin url of existing binary. @@ -168,7 +168,7 @@ def test_get_latest_runner_bin_url(runner_manager: RunnerManager, arch: Arch, ch assert runner_bin["filename"] == filename -def test_get_latest_runner_bin_url_missing_binary(runner_manager: RunnerManager): +def test_get_latest_runner_bin_url_missing_binary(runner_manager: LXDRunnerManager): """ arrange: Given a mocked GH API client that does not return any runner binaries. act: Get runner bin url of non-existing binary. @@ -181,7 +181,7 @@ def test_get_latest_runner_bin_url_missing_binary(runner_manager: RunnerManager) runner_manager.get_latest_runner_bin_url(os_name="not_exist") -def test_update_runner_bin(runner_manager: RunnerManager): +def test_update_runner_bin(runner_manager: LXDRunnerManager): """ arrange: Remove the existing runner binary. act: Update runner binary. 
@@ -222,7 +222,7 @@ def iter_content(self, *args, **kwargs): assert runner_manager.runner_bin_path.read_bytes() == TEST_BINARY -def test_reconcile_zero_count(runner_manager: RunnerManager): +def test_reconcile_zero_count(runner_manager: LXDRunnerManager): """ arrange: Nothing. act: Reconcile with the current amount of runner. @@ -234,7 +234,7 @@ def test_reconcile_zero_count(runner_manager: RunnerManager): assert delta == 0 -def test_reconcile_create_runner(runner_manager: RunnerManager): +def test_reconcile_create_runner(runner_manager: LXDRunnerManager): """ arrange: Nothing. act: Reconcile to create a runner. @@ -246,7 +246,7 @@ def test_reconcile_create_runner(runner_manager: RunnerManager): assert delta == 1 -def test_reconcile_remove_runner(runner_manager: RunnerManager): +def test_reconcile_remove_runner(runner_manager: LXDRunnerManager): """ arrange: Create online runners. act: Reconcile to remove a runner. @@ -282,7 +282,7 @@ def mock_get_runners(): assert delta == -1 -def test_reconcile(runner_manager: RunnerManager, tmp_path: Path): +def test_reconcile(runner_manager: LXDRunnerManager, tmp_path: Path): """ arrange: Setup one runner. act: Reconcile with the current amount of runner. @@ -295,7 +295,7 @@ def test_reconcile(runner_manager: RunnerManager, tmp_path: Path): assert len(runner_manager._get_runners()) == 1 -def test_empty_flush(runner_manager: RunnerManager): +def test_empty_flush(runner_manager: LXDRunnerManager): """ arrange: No initial runners. act: Perform flushing with no runners. @@ -305,7 +305,7 @@ def test_empty_flush(runner_manager: RunnerManager): runner_manager.flush() -def test_flush(runner_manager: RunnerManager, tmp_path: Path): +def test_flush(runner_manager: LXDRunnerManager, tmp_path: Path): """ arrange: Create some runners. act: Perform flushing. 
@@ -319,7 +319,7 @@ def test_flush(runner_manager: RunnerManager, tmp_path: Path): def test_reconcile_issues_runner_installed_event( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, issue_event_mock: MagicMock, charm_state: MagicMock, @@ -341,7 +341,7 @@ def test_reconcile_issues_runner_installed_event( def test_reconcile_issues_no_runner_installed_event_if_metrics_disabled( - runner_manager: RunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock + runner_manager: LXDRunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock ): """ arrange: Disable issuing of metrics. @@ -356,7 +356,7 @@ def test_reconcile_issues_no_runner_installed_event_if_metrics_disabled( def test_reconcile_error_on_issue_event_is_ignored( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock, ): @@ -375,7 +375,7 @@ def test_reconcile_error_on_issue_event_is_ignored( def test_reconcile_issues_reconciliation_metric_event( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, issue_event_mock: MagicMock, runner_metrics: MagicMock, @@ -458,7 +458,7 @@ def mock_get_runners(): def test_reconcile_places_timestamp_in_newly_created_runner( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, shared_fs: MagicMock, tmp_path: Path, @@ -485,7 +485,7 @@ def test_reconcile_places_timestamp_in_newly_created_runner( def test_reconcile_error_on_placing_timestamp_is_ignored( - runner_manager: RunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock + runner_manager: LXDRunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock ): """ arrange: Enable issuing of metrics and do not create the directory for the shared filesystem\ @@ -504,7 +504,7 @@ def test_reconcile_error_on_placing_timestamp_is_ignored( def 
test_reconcile_places_no_timestamp_in_newly_created_runner_if_metrics_disabled( - runner_manager: RunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock + runner_manager: LXDRunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock ): """ arrange: Disable issuing of metrics, mock timestamps and the shared filesystem module. @@ -522,7 +522,7 @@ def test_reconcile_places_no_timestamp_in_newly_created_runner_if_metrics_disabl def test_reconcile_reactive_mode( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, reactive_reconcile_mock: MagicMock, caplog: LogCaptureFixture, ): @@ -542,7 +542,7 @@ def test_reconcile_reactive_mode( def test_schedule_build_runner_image( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, tmp_path: Path, charm_state: CharmState, monkeypatch: MonkeyPatch, @@ -569,7 +569,7 @@ def test_schedule_build_runner_image( assert cronfile.read_text() == f"4 4,10,16,22 * * * ubuntu {cmd} jammy\n" -def test_has_runner_image(runner_manager: RunnerManager): +def test_has_runner_image(runner_manager: LXDRunnerManager): """ arrange: Multiple setups. 1. no runner image exists. diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py deleted file mode 100644 index 5329b1282..000000000 --- a/tests/unit/test_openstack_manager.py +++ /dev/null @@ -1,1200 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
-import random -import secrets -from pathlib import Path -from typing import Optional -from unittest.mock import MagicMock, call - -import jinja2 -import openstack.connection -import openstack.exceptions -import pytest -from fabric.connection import Connection as SSHConnection -from invoke import Result -from openstack.compute.v2.keypair import Keypair -from openstack.compute.v2.server import Server -from pytest import LogCaptureFixture, MonkeyPatch - -import metrics.storage -import reactive.runner_manager -from charm_state import CharmState, ProxyConfig, ReactiveConfig, RepoPolicyComplianceConfig -from errors import OpenStackError, RunnerStartError -from github_type import GitHubRunnerStatus, RunnerApplication, SelfHostedRunner -from metrics import events as metric_events -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from metrics.storage import MetricsStorage -from openstack_cloud import openstack_manager -from openstack_cloud.openstack_manager import MAX_METRICS_FILE_SIZE, METRICS_EXCHANGE_PATH -from runner_manager_type import FlushMode -from runner_type import RunnerGithubInfo, RunnerNameByHealth -from tests.unit import factories - -FAKE_MONGODB_URI = "mongodb://example.com/db" -CLOUD_NAME = "microstack" - - -@pytest.fixture(autouse=True, name="openstack_connect_mock") -def mock_openstack_connect_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock openstack.connect.""" - mock_connect = MagicMock(spec=openstack_manager.openstack.connect) - monkeypatch.setattr("openstack_cloud.openstack_manager.openstack.connect", mock_connect) - return mock_connect - - -@pytest.fixture(name="mock_server") -def mock_server_fixture() -> MagicMock: - """Mock OpenStack Server object.""" - mock_server = MagicMock(spec=Server) - mock_server.key_name = "mock_key" - mock_server.addresses.values = MagicMock(return_value=[[{"addr": "10.0.0.1"}]]) - return mock_server - - -@pytest.fixture(name="patch_get_ssh_connection_health_check") -def 
patch_get_ssh_connection_health_check_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch SSH connection to a MagicMock instance for get_ssh_connection health check.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_ssh_connection.host = "test host IP" - mock_result = MagicMock(spec=Result) - mock_result.ok = True - mock_result.stderr = "" - mock_result.stdout = "hello world" - mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = [mock_ssh_connection] - - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - mock_get_ssh_connection, - ) - - -@pytest.fixture(name="ssh_connection_health_check") -def ssh_connection_health_check_fixture(monkeypatch: pytest.MonkeyPatch): - """SSH connection to a MagicMock instance for health check.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_ssh_connection.host = "test host IP" - mock_result = MagicMock(spec=Result) - mock_result.ok = True - mock_result.stderr = "" - mock_result.stdout = "-- Test output: /bin/bash /home/ubuntu/actions-runner/run.sh --" - mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = mock_ssh_connection - - return mock_get_ssh_connection - - -@pytest.fixture(name="patch_ssh_connection_error") -def patch_ssh_connection_error_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch SSH connection to a MagicMock instance with error on run.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_result = MagicMock(spec=Result) - mock_result.ok = False - mock_result.stdout = "Mock stdout" - mock_result.stderr = "Mock stderr" - 
mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = mock_ssh_connection - - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - mock_get_ssh_connection, - ) - - -@pytest.fixture(name="mock_github_client") -def mock_github_client_fixture() -> MagicMock: - """Mocked github client that returns runner application.""" - mock_github_client = MagicMock(spec=openstack_manager.GithubClient) - mock_github_client.get_runner_application.return_value = RunnerApplication( - os="linux", - architecture="x64", - download_url="http://test_url", - filename="test_filename", - temp_download_token="test_token", - ) - mock_github_client.get_runner_registration_token.return_value = "test_token" - return mock_github_client - - -@pytest.fixture(name="patch_execute_command") -def patch_execute_command_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch execute command to a MagicMock instance.""" - monkeypatch.setattr( - openstack_manager, - "execute_command", - MagicMock(spec=openstack_manager.execute_command), - ) - - -@pytest.fixture(name="patched_create_connection_context") -def patched_create_connection_context_fixture(monkeypatch: pytest.MonkeyPatch): - """Return a mocked openstack connection context manager and patch create_connection.""" - mock_connection = MagicMock(spec=openstack_manager.openstack.connection.Connection) - monkeypatch.setattr( - openstack_manager, - "_create_connection", - MagicMock(spec=openstack_manager._create_connection, return_value=mock_connection), - ) - return mock_connection.__enter__() - - -@pytest.fixture(name="ssh_connection_mock") -def ssh_connection_mock_fixture() -> MagicMock: - """Return a mocked ssh connection.""" - test_file_content = secrets.token_hex(16) - ssh_conn_mock = MagicMock(spec=openstack_manager.SSHConnection) - ssh_conn_mock.get.side_effect = lambda remote, local: Path(local).write_text(test_file_content) - ssh_conn_mock.run.side_effect = lambda cmd, 
**kwargs: ( - Result(stdout="1") if cmd.startswith("stat") else Result() - ) - ssh_conn_mock.run.return_value = Result() - - return ssh_conn_mock - - -@pytest.fixture(name="openstack_manager_for_reconcile") -def openstack_manager_for_reconcile_fixture( - monkeypatch: pytest.MonkeyPatch, - mock_github_client: MagicMock, - patched_create_connection_context: MagicMock, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """Create a mocked openstack manager for the reconcile tests.""" - t_mock = MagicMock(return_value=12345) - monkeypatch.setattr(openstack_manager.time, "time", t_mock) - - issue_event_mock = MagicMock(spec=metric_events.issue_event) - monkeypatch.setattr(openstack_manager.metric_events, "issue_event", issue_event_mock) - - runner_metrics_mock = MagicMock(openstack_manager.runner_metrics) - monkeypatch.setattr(openstack_manager, "runner_metrics", runner_metrics_mock) - - github_metrics_mock = MagicMock(openstack_manager.github_metrics) - monkeypatch.setattr(openstack_manager, "github_metrics", github_metrics_mock) - - monkeypatch.setattr( - openstack_manager, "GithubClient", MagicMock(return_value=mock_github_client) - ) - - runner_metrics_path = tmp_path / "runner_fs" - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - - pool_mock = MagicMock() - pool_mock.__enter__.return_value = pool_mock - pool_mock.map.side_effect = lambda func, iterable: func(*iterable) - pool_cls_mock = MagicMock() - pool_cls_mock.return_value = pool_mock - monkeypatch.setattr(openstack_manager, "Pool", pool_cls_mock) - - app_name = secrets.token_hex(16) - charm_state = MagicMock(spec=CharmState) - charm_state.proxy_config = ProxyConfig() - charm_state.ssh_debug_connections = MagicMock() - charm_state.charm_config = MagicMock() - 
charm_state.charm_config.repo_policy_compliance = None - os_runner_manager_config = openstack_manager.OpenstackRunnerManagerConfig( - charm_state=charm_state, - path=MagicMock(), - labels=[], - token=secrets.token_hex(16), - flavor=app_name, - image="test-image-id", - network=secrets.token_hex(16), - dockerhub_mirror=None, - ) - patched_create_connection_context.create_keypair.return_value = Keypair(private_key="test_key") - server_mock = MagicMock() - server_mock.status = openstack_manager._INSTANCE_STATUS_ACTIVE - patched_create_connection_context.get_server.return_value = server_mock - - os_runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=app_name, - unit_num=0, - openstack_runner_manager_config=os_runner_manager_config, - cloud_config={}, - ) - os_runner_manager._ssh_health_check = MagicMock(return_value=True) - os_runner_manager._get_ssh_connection = MagicMock(return_value=ssh_connection_mock) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, "_wait_until_runner_process_running", MagicMock() - ) - - monkeypatch.setattr(openstack_manager, "_SSH_KEY_PATH", tmp_path) - monkeypatch.setattr(openstack_manager.shutil, "chown", MagicMock()) - - return os_runner_manager - - -@pytest.fixture(name="reactive_reconcile_mock") -def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> MagicMock: - """Mock the job class.""" - reconcile_mock = MagicMock(spec=reactive.runner_manager.reconcile) - monkeypatch.setattr( - "openstack_cloud.openstack_manager.reactive_runner_manager.reconcile", reconcile_mock - ) - reconcile_mock.side_effect = lambda quantity, **kwargs: quantity - return reconcile_mock - - -def test__create_connection_error(clouds_yaml: dict, openstack_connect_mock: MagicMock): - """ - arrange: given a monkeypatched connection.authorize() function that raises an error. - act: when _create_connection is called. - assert: OpenStackUnauthorizedError is raised. 
- """ - connection_mock = MagicMock() - connection_context = MagicMock() - connection_context.authorize.side_effect = openstack.exceptions.HttpException - connection_mock.__enter__.return_value = connection_context - openstack_connect_mock.return_value = connection_mock - - with pytest.raises(OpenStackError) as exc: - with openstack_manager._create_connection(cloud_config=clouds_yaml): - pass - - assert "Failed OpenStack API call" in str(exc) - - -def test__create_connection( - multi_clouds_yaml: dict, clouds_yaml: dict, cloud_name: str, openstack_connect_mock: MagicMock -): - """ - arrange: given a cloud config yaml dict with 1. multiple clouds 2. single cloud. - act: when _create_connection is called. - assert: connection with first cloud in the config is used. - """ - # 1. multiple clouds - with openstack_manager._create_connection(cloud_config=multi_clouds_yaml): - openstack_connect_mock.assert_called_with(cloud=CLOUD_NAME) - - # 2. single cloud - with openstack_manager._create_connection(cloud_config=clouds_yaml): - openstack_connect_mock.assert_called_with(cloud=cloud_name) - - -@pytest.mark.parametrize( - "dockerhub_mirror, ssh_debug_connections, expected_env_contents", - [ - pytest.param( - None, - None, - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh -""", - id="all values empty", - ), - pytest.param( - "http://dockerhub_mirror.test", - None, - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -DOCKERHUB_MIRROR=http://dockerhub_mirror.test -CONTAINER_REGISTRY_URL=http://dockerhub_mirror.test - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh -""", - id="dockerhub mirror set", - ), - pytest.param( - None, - [ - openstack_manager.SSHDebugConnection( - host="127.0.0.1", - port=10022, - rsa_fingerprint="SHA256:testrsa", - 
ed25519_fingerprint="SHA256:tested25519", - ) - ], - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh - -TMATE_SERVER_HOST=127.0.0.1 -TMATE_SERVER_PORT=10022 -TMATE_SERVER_RSA_FINGERPRINT=SHA256:testrsa -TMATE_SERVER_ED25519_FINGERPRINT=SHA256:tested25519 -""", - id="ssh debug connection set", - ), - pytest.param( - "http://dockerhub_mirror.test", - [ - openstack_manager.SSHDebugConnection( - host="127.0.0.1", - port=10022, - rsa_fingerprint="SHA256:testrsa", - ed25519_fingerprint="SHA256:tested25519", - ) - ], - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -DOCKERHUB_MIRROR=http://dockerhub_mirror.test -CONTAINER_REGISTRY_URL=http://dockerhub_mirror.test - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh - -TMATE_SERVER_HOST=127.0.0.1 -TMATE_SERVER_PORT=10022 -TMATE_SERVER_RSA_FINGERPRINT=SHA256:testrsa -TMATE_SERVER_ED25519_FINGERPRINT=SHA256:tested25519 -""", - id="all values set", - ), - ], -) -def test__generate_runner_env( - dockerhub_mirror: Optional[str], - ssh_debug_connections: Optional[list[openstack_manager.SSHDebugConnection]], - expected_env_contents: str, -): - """ - arrange: given configuration values for runner environment. - act: when _generate_runner_env is called. - assert: expected .env contents are generated. - """ - environment = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) - assert ( - openstack_manager._generate_runner_env( - templates_env=environment, - dockerhub_mirror=dockerhub_mirror, - ssh_debug_connections=ssh_debug_connections, - ) - == expected_env_contents - ) - - -def test_reconcile_issues_runner_installed_event( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, -): - """ - arrange: Mock openstack manager for reconcile. 
- act: Reconcile to create a runner. - assert: The expected event is issued. - """ - openstack_manager_for_reconcile.reconcile(quantity=1) - - openstack_manager.metric_events.issue_event.assert_has_calls( - [ - call( - event=metric_events.RunnerInstalled( - timestamp=openstack_manager.time.time(), - flavor=openstack_manager_for_reconcile.app_name, - duration=0, - ) - ) - ] - ) - - -def test_reconcile_places_timestamp_in_metrics_storage( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Mock timestamps and create the directory for the metrics storage. - act: Reconcile to create a runner. - assert: The expected timestamp is placed in the shared filesystem. - """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - assert (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - assert (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).read_text() == str( - openstack_manager.time.time() - ) - - -def test_reconcile_error_on_placing_timestamp_is_ignored( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Do not create the directory for the metrics storage\ - in order to let a FileNotFoundError to be raised inside the OpenstackRunnerManager. - act: Reconcile to create a runner. - assert: No exception is raised. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - assert not (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - - -def test_reconcile_pulls_metric_files( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """ - arrange: Mock the metrics storage and the ssh connection. - act: Reconcile to create a runner. - assert: The expected metric files are pulled from the shared filesystem. - """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=(), unhealthy=("test_runner",)) - ) - ssh_connection_mock.get.side_effect = MagicMock() - openstack_manager_for_reconcile.reconcile(quantity=0) - - ssh_connection_mock.get.assert_any_call( - remote=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local=str(ms.path / "pre-job-metrics.json"), - ) - ssh_connection_mock.get.assert_any_call( - remote=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local=str(ms.path / "post-job-metrics.json"), - ) - - -def test_reconcile_does_not_pull_too_large_files( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """ - arrange: Mock the metrics storage and the ssh connection to return a file that is too large. 
- act: Reconcile to create a runner. - assert: The expected metric files are not pulled from the shared filesystem. - """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - ssh_connection_mock.run.side_effect = lambda cmd, **kwargs: ( - Result(stdout=f"{MAX_METRICS_FILE_SIZE + 1}") if cmd.startswith("stat") else Result() - ) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=("test_runner",), unhealthy=()) - ) - - openstack_manager_for_reconcile.reconcile(quantity=0) - - assert not (ms.path / "pre-job-metrics.json").exists() - assert not (ms.path / "post-job-metrics.json").exists() - - -def test_reconcile_issue_reconciliation_metrics( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Mock the metrics storage and the ssh connection. - act: Reconcile. - assert: The expected reconciliation metrics are issued. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=("test_runner",), unhealthy=()) - ) - - openstack_manager.runner_metrics.extract.return_value = (MagicMock() for _ in range(2)) - openstack_manager.runner_metrics.issue_events.side_effect = [ - {metric_events.RunnerStart, metric_events.RunnerStop}, - {metric_events.RunnerStart}, - ] - - openstack_manager_for_reconcile._github.get_runner_github_info.return_value = [ - SelfHostedRunner( - busy=False, - id=1, - labels=[], - os="linux", - name=f"{openstack_manager_for_reconcile.instance_name}-test_runner", - status=GitHubRunnerStatus.ONLINE, - ) - ] - openstack_manager_for_reconcile.reconcile(quantity=0) - - openstack_manager.metric_events.issue_event.assert_has_calls( - [ - call( - event=metric_events.Reconciliation( - timestamp=12345, - flavor=openstack_manager_for_reconcile.app_name, - crashed_runners=1, - idle_runners=1, - duration=0, - ) - ) - ] - ) - - -def test_reconcile_ignores_metrics_for_openstack_online_runners( - openstack_manager_for_reconcile, - monkeypatch, - tmp_path, - patched_create_connection_context: MagicMock, -): - """ - arrange: Combination of runner status/github status and openstack status. - act: Call reconcile. - assert: All runners which have an instance on Openstack are ignored for metrics extraction. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - instance_name = openstack_manager_for_reconcile.instance_name - runner_names = { - k: f"{instance_name}-{k}" - for k in [ - "healthy_online", - "healthy_offline", - "unhealthy_online", - "unhealthy_offline", - "openstack_online_no_github_status", - "github_online_no_openstack_status", - ] - } - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth( - healthy=(runner_names["healthy_online"], runner_names["healthy_offline"]), - unhealthy=( - runner_names["unhealthy_online"], - runner_names["unhealthy_offline"], - runner_names["github_online_no_openstack_status"], - ), - ) - ) - openstack_manager_for_reconcile.get_github_runner_info = MagicMock( - return_value=( - RunnerGithubInfo( - runner_name=runner_names["healthy_online"], runner_id=0, online=True, busy=True - ), - RunnerGithubInfo( - runner_name=runner_names["unhealthy_online"], runner_id=1, online=True, busy=False - ), - RunnerGithubInfo( - runner_name=runner_names["healthy_offline"], runner_id=2, online=False, busy=False - ), - RunnerGithubInfo( - runner_name=runner_names["unhealthy_offline"], - runner_id=3, - online=False, - busy=False, - ), - RunnerGithubInfo( - runner_name=runner_names["github_online_no_openstack_status"], - runner_id=4, - online=True, - busy=False, - ), - ) - ) - - openstack_online_runner_names = [ - runner - for (name, runner) in runner_names.items() - if name != "github_online_no_openstack_status" - ] - openstack_instances = [ - openstack_manager.openstack.compute.v2.server.Server( - name=runner_name, status=random.choice(("ACTIVE", "BUILD", "STOPPED")) - ) - for runner_name in 
openstack_online_runner_names - ] - patched_create_connection_context.list_servers.return_value = openstack_instances - - openstack_manager.runner_metrics.extract.return_value = (MagicMock() for _ in range(1)) - openstack_manager.runner_metrics.issue_events.side_effect = [ - {metric_events.RunnerStart, metric_events.RunnerStop}, - ] - - openstack_manager_for_reconcile.reconcile(quantity=0) - - openstack_manager.runner_metrics.extract.assert_called_once_with( - metrics_storage_manager=metrics.storage, - runners=set(openstack_online_runner_names), - ) - - -def test_reconcile_reactive_mode( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - reactive_reconcile_mock: MagicMock, - caplog: LogCaptureFixture, -): - """ - arrange: Enable reactive mode and mock the job class to return a job. - act: Call reconcile with a random quantity n. - assert: The mocked job is picked up n times and the expected log message is present. - """ - count = random.randint(0, 5) - openstack_manager_for_reconcile._config.reactive_config = ReactiveConfig( - mq_uri=FAKE_MONGODB_URI - ) - actual_count = openstack_manager_for_reconcile.reconcile(quantity=count) - - assert actual_count == count - reactive_reconcile_mock.assert_called_with( - quantity=count, - mq_uri=FAKE_MONGODB_URI, - queue_name=openstack_manager_for_reconcile.app_name, - ) - - -def test_repo_policy_config( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - patched_create_connection_context: MagicMock, -): - """ - arrange: Mock the repo policy compliance config. - act: Reconcile to create a runner. - assert: The expected url and one-time-token is present in the pre-job script in \ - the cloud-init data. 
- """ - test_url = "http://test.url" - token = secrets.token_hex(16) - one_time_token = secrets.token_hex(16) - openstack_manager_for_reconcile._config.charm_state.charm_config.repo_policy_compliance = ( - RepoPolicyComplianceConfig(url=test_url, token=token) - ) - repo_policy_compliance_client_mock = MagicMock( - spec=openstack_manager.RepoPolicyComplianceClient - ) - repo_policy_compliance_client_mock.base_url = test_url - repo_policy_compliance_client_mock.get_one_time_token.return_value = one_time_token - repo_policy_compliance_cls_mock = MagicMock(return_value=repo_policy_compliance_client_mock) - monkeypatch.setattr( - openstack_manager, "RepoPolicyComplianceClient", repo_policy_compliance_cls_mock - ) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - cloud_init_data_str = patched_create_connection_context.create_server.call_args[1]["userdata"] - repo_policy_compliance_client_mock.get_one_time_token.assert_called_once() - assert one_time_token in cloud_init_data_str - assert test_url in cloud_init_data_str - - -def test__ensure_security_group_with_existing_rules(): - """ - arrange: Mock OpenStack connection with the security rules created. - act: Run `_ensure_security_group`. - assert: The security rules are not created again. - """ - connection_mock = MagicMock(spec=openstack.connection.Connection) - connection_mock.list_security_groups.return_value = [ - { - "security_group_rules": [ - {"protocol": "icmp"}, - {"protocol": "tcp", "port_range_min": 22, "port_range_max": 22}, - {"protocol": "tcp", "port_range_min": 10022, "port_range_max": 10022}, - ], - "id": "TEST_ID", - } - ] - - openstack_manager.OpenstackRunnerManager._ensure_security_group(connection_mock) - connection_mock.create_security_group_rule.assert_not_called() - - -def test__get_ssh_connection( - monkeypatch, - patch_get_ssh_connection_health_check, - mock_server: MagicMock, -): - """ - arrange: A server with SSH setup correctly. - act: Get the SSH connections. 
- assert: The SSH connections contains at least one connection. - """ - # Patching the `_get_key_path` to get around the keyfile checks. - mock__get_key_path = MagicMock(spec=openstack_manager.OpenstackRunnerManager._get_key_path) - mock_key_path = MagicMock(spec=Path) - mock_key_path.exists.return_value = True - mock__get_key_path.return_value = mock_key_path - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, "_get_key_path", mock__get_key_path - ) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - conn = openstack_manager.OpenstackRunnerManager._get_ssh_connection( - mock_connection, mock_server.name - ) - assert conn is not None - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_success(monkeypatch: pytest.MonkeyPatch, mock_server: MagicMock): - """ - arrange: A server with SSH correctly setup. - act: Run health check on the server. - assert: The health check passes. - """ - ssh_connection_mock = MagicMock() - result_mock = MagicMock() - result_mock.stdout = "/home/ubuntu/actions-runner/run.sh\nRunner.Worker" - ssh_connection_mock.run.return_value = result_mock - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=ssh_connection_mock), - ) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - assert openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_no_key(mock_server: MagicMock): - """ - arrange: A server with no key available. - act: Run health check on the server. - assert: The health check fails. - """ - # Remove the mock SSH key. 
- mock_server.key_name = None - - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - assert "no valid keypair found" in str(exc) - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_error(monkeypatch: pytest.MonkeyPatch, mock_server: MagicMock): - """ - arrange: A server with error on SSH run. - act: Run health check on the server. - assert: The health check fails. - """ - monkeypatch.setattr(openstack_manager.OpenstackRunnerManager, "_get_key_path", MagicMock()) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(side_effect=TimeoutError) - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - assert "No connectable SSH addresses found" in str(exc) - - -def test__wait_until_runner_process_running_no_server(): - """ - arrange: No server existing on the OpenStack connection. - act: Check if runner process is running. - assert: RunnerStartError thrown. 
- """ - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = None - - with pytest.raises(RunnerStartError): - openstack_manager.OpenstackRunnerManager._wait_until_runner_process_running( - mock_connection, "Non-existing-server" - ) - - -@pytest.mark.parametrize( - "server", - [ - pytest.param(None, id="no server"), - pytest.param(factories.MockOpenstackServer(status="SHUTOFF"), id="shutoff"), - pytest.param(factories.MockOpenstackServer(status="REBUILD"), id="not active/building"), - ], -) -def test__health_check(server: factories.MockOpenstackServer | None): - """ - arrange: given a mock openstack.get_server response. - act: when _health_check is called. - assert: False is returned, meaning unhealthy runner. - """ - mock_get_server = MagicMock(return_value=server) - mock_connection = MagicMock() - mock_connection.get_server = mock_get_server - - assert not openstack_manager.OpenstackRunnerManager._health_check( - conn=mock_connection, server_name="test" - ) - - -# The SSH health check will temporarily return True on failure for debugging purposes. -@pytest.mark.xfail -def test__ssh_health_check_connection_error(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a monkeypatched _get_ssh_connection function that raises _SSHError. - act: when _ssh_health_check is called. - assert: False is returned, meaning unhealthy runner. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(side_effect=openstack_manager._SSHError), - ) - - assert not openstack_manager.OpenstackRunnerManager._ssh_health_check( - server=MagicMock(), startup=False - ) - - -@pytest.mark.parametrize( - "result", - [ - pytest.param(factories.MockSSHRunResult(exited=1), id="ssh result not ok"), - pytest.param( - factories.MockSSHRunResult(exited=0, stdout=""), - id="runner process not found in stdout", - ), - # This health check should fail but temporarily marking as passing for passive runner - # deletion until we have more data. - pytest.param( - factories.MockSSHRunResult(exited=0, stdout="/home/ubuntu/actions-runner/run.sh"), - id="startup process exists but no listener or worker process", - ), - ], -) -@pytest.mark.xfail -def test__ssh_health_check_unhealthy( - monkeypatch: pytest.MonkeyPatch, result: factories.MockSSHRunResult -): - """ - arrange: given unhealthy ssh responses. - act: when _ssh_health_check is called. - assert: False is returned, meaning unhealthy runner. 
- """ - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(return_value=result) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=mock_ssh_connection), - ) - - assert not openstack_manager.OpenstackRunnerManager._ssh_health_check( - server=MagicMock(), startup=False - ) - - -@pytest.mark.parametrize( - "result, startup", - [ - pytest.param( - factories.MockSSHRunResult( - exited=0, stdout="/home/ubuntu/actions-runner/run.sh\nRunner.Worker" - ), - False, - id="runner process & workper process found", - ), - pytest.param( - factories.MockSSHRunResult( - exited=0, stdout="/home/ubuntu/actions-runner/run.sh\nRunner.Listener" - ), - False, - id="runner process & listener process found", - ), - pytest.param( - factories.MockSSHRunResult(exited=0, stdout="/home/ubuntu/actions-runner/run.sh"), - True, - id="runner process found for startup", - ), - ], -) -def test__ssh_health_check_healthy( - monkeypatch: pytest.MonkeyPatch, result: factories.MockSSHRunResult, startup: bool -): - """ - arrange: given healthy ssh response. - act: when _ssh_health_check is called. - assert: True is returned, meaning healthy runner. - """ - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(return_value=result) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=mock_ssh_connection), - ) - - assert openstack_manager.OpenstackRunnerManager._ssh_health_check( - conn=MagicMock(), server_name=MagicMock(), startup=startup - ) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server_gone(): - """ - arrange: given a mock Openstack get_server function that returns None. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - mock_connection = MagicMock() - mock_connection.get_server.return_value = None - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Server gone while trying to get SSH connection" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_no_server_key(): - """ - arrange: given a mock server instance with no key attached. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. - """ - mock_server = MagicMock() - mock_server.key_name = None - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Unable to create SSH connection, no valid keypair found" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_key_not_exists(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a monkeypatched _get_key_path function that returns a non-existent path. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. - """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path("does-not-exist")), - ) - mock_connection = MagicMock() - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Missing keyfile for server" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server_no_addresses(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a mock server instance with no server addresses. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "No addresses found for OpenStack server" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -@pytest.mark.parametrize( - "run", - [ - pytest.param(MagicMock(side_effect=TimeoutError), id="timeout error"), - pytest.param( - MagicMock(return_value=factories.MockSSHRunResult(exited=1)), id="result not ok" - ), - pytest.param( - MagicMock(return_value=factories.MockSSHRunResult(exited=0, stdout="")), - id="empty response", - ), - ], -) -def test__get_ssh_connection_server_no_valid_connections( - monkeypatch: pytest.MonkeyPatch, run: MagicMock -): - """ - arrange: given a monkeypatched Connection instance that returns invalid connections. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {"test": [{"addr": "test-address"}]} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = run - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "No connectable SSH addresses found" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given monkeypatched SSH connection instance. - act: when _get_ssh_connection is called. - assert: the SSH connection instance is returned. - """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {"test": [{"addr": "test-address"}]} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock( - return_value=factories.MockSSHRunResult(exited=0, stdout="hello world") - ) - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - assert ( - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - == mock_ssh_connection - ) - - -def test_flush(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given monkeypatched sub functions of flush. - act: when flush is called. - assert: sub functions are called. 
- """ - monkeypatch.setattr(openstack_manager, "_create_connection", MagicMock()) - monkeypatch.setattr(openstack_manager, "set_env_var", MagicMock()) - runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=MagicMock(), - unit_num=MagicMock(), - openstack_runner_manager_config=MagicMock(), - cloud_config=MagicMock(), - ) - runner_manager._kill_runner_processes = MagicMock() - runner_manager._get_openstack_runner_status = MagicMock() - runner_manager._github = MagicMock() - runner_manager._remove_runners = MagicMock() - - runner_manager.flush(mode=MagicMock()) - - runner_manager._kill_runner_processes.assert_called() - runner_manager._get_openstack_runner_status.assert_called() - runner_manager._github.get_runner_remove_token.assert_called() - runner_manager._remove_runners.assert_called() - - -@pytest.mark.parametrize( - "flush_mode, expected_command", - [ - pytest.param( - FlushMode.FLUSH_BUSY, - "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" - "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);", - id="Flush Busy", - ), - pytest.param( - FlushMode.FLUSH_IDLE, - "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " - "kill $(pgrep -x Runner.Listener)", - id="Flush Idle", - ), - ], -) -def test__kill_runner_processes( - monkeypatch: pytest.MonkeyPatch, flush_mode: FlushMode, expected_command: str -): - """ - arrange: given supported flush modes. - act: when _kill_runner_processes is called. - assert: expected kill commands are issued. 
- """ - monkeypatch.setattr(openstack_manager, "_create_connection", MagicMock()) - monkeypatch.setattr(openstack_manager, "set_env_var", MagicMock()) - runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=MagicMock(), - unit_num=MagicMock(), - openstack_runner_manager_config=MagicMock(), - cloud_config=MagicMock(), - ) - runner_manager._get_openstack_instances = MagicMock(return_value=[MagicMock(), MagicMock()]) - mock_connection = MagicMock() - runner_manager._get_ssh_connection = MagicMock(return_value=mock_connection) - - runner_manager._kill_runner_processes(conn=MagicMock(), mode=flush_mode) - - mock_connection.run.assert_called_with(expected_command, warn=True) diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py index fdf8fc2a1..af7954d06 100644 --- a/tests/unit/test_runner.py +++ b/tests/unit/test_runner.py @@ -13,7 +13,7 @@ from _pytest.monkeypatch import MonkeyPatch import metrics.runner_logs -from charm_state import GithubOrg, GithubRepo, SSHDebugConnection, VirtualMachineResources +from charm_state import GitHubOrg, GitHubRepo, SSHDebugConnection, VirtualMachineResources from errors import ( CreateMetricsStorageError, LxdError, @@ -138,11 +138,11 @@ def ssh_debug_connections_fixture() -> list[SSHDebugConnection]: name="runner", params=[ ( - GithubOrg("test_org", "test_group"), + GitHubOrg("test_org", "test_group"), ProxySetting(no_proxy=None, http=None, https=None, aproxy_address=None), ), ( - GithubRepo("test_owner", "test_repo"), + GitHubRepo("test_owner", "test_repo"), ProxySetting( no_proxy="test_no_proxy", http=TEST_PROXY_SERVER_URL, diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py new file mode 100644 index 000000000..845c8da49 --- /dev/null +++ b/tests/unit/test_runner_scaler.py @@ -0,0 +1,266 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ + +from typing import Iterable +from unittest.mock import MagicMock + +import pytest + +from charm_state import GitHubPath, GitHubRepo +from manager.cloud_runner_manager import CloudRunnerState, InstanceId +from manager.github_runner_manager import GitHubRunnerState +from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig +from manager.runner_scaler import RunnerScaler +from tests.unit.mock_runner_managers import ( + MockCloudRunnerManager, + MockGitHubRunnerManager, + SharedMockRunnerManagerState, +) + + +def mock_runner_manager_spawn_runners( + create_runner_args: Iterable[RunnerManager._CreateRunnerArgs], +) -> tuple[InstanceId, ...]: + """Mock _spawn_runners method of RunnerManager. + + The _spawn_runners method uses multi-process, which copies the object, e.g., the mocks. + There is no easy way to sync the state of the mocks object across processes. Replacing the + _spawn_runners to remove the multi-process.pool is an easier approach. + + Args: + create_runner_args: The arguments for the create_runner method. + + Returns: + The instance ids of the runner spawned.
+ """ + return tuple(RunnerManager._create_runner(arg) for arg in create_runner_args) + + +@pytest.fixture(scope="function", name="github_path") +def github_path_fixture() -> GitHubPath: + return GitHubRepo("mock_owner", "mock_repo") + + +@pytest.fixture(scope="function", name="mock_runner_managers") +def mock_runner_managers_fixture( + github_path: GitHubPath, +) -> tuple[MockCloudRunnerManager, MockGitHubRunnerManager]: + state = SharedMockRunnerManagerState() + mock_cloud = MockCloudRunnerManager(state) + mock_github = MockGitHubRunnerManager(mock_cloud.name_prefix, github_path, state) + return (mock_cloud, mock_github) + + +@pytest.fixture(scope="function", name="runner_manager") +def runner_manager_fixture( + monkeypatch, mock_runner_managers, github_path: GitHubPath +) -> RunnerManager: + mock_cloud, mock_github = mock_runner_managers + monkeypatch.setattr( + "manager.runner_manager.RunnerManager._spawn_runners", mock_runner_manager_spawn_runners + ) + # Patch out the metrics, as metrics has their own tests. 
+ monkeypatch.setattr("manager.runner_manager.github_metrics.job", MagicMock()) + monkeypatch.setattr("manager.runner_manager.runner_metrics.issue_events", MagicMock()) + + config = RunnerManagerConfig("mock_token", github_path) + runner_manager = RunnerManager("mock_runners", mock_cloud, config) + runner_manager._github = mock_github + return runner_manager + + +@pytest.fixture(scope="function", name="runner_scaler") +def runner_scaler_fixture(runner_manager: RunnerManager) -> RunnerScaler: + return RunnerScaler(runner_manager, None) + + +@pytest.fixture(scope="function", name="runner_scaler_one_runner") +def runner_scaler_one_runner_fixture(runner_scaler: RunnerScaler) -> RunnerScaler: + runner_scaler.reconcile(1) + assert_runner_info(runner_scaler, online=1) + return runner_scaler + + +def set_one_runner_state( + runner_scaler: RunnerScaler, + github_state: GitHubRunnerState | None = None, + cloud_state: CloudRunnerState | None = None, +): + """Set the runner state for a RunnerScaler with one runner. + + Args: + runner_scaler: The RunnerScaler instance to modify. + github_state: The github state to set the runner. + cloud_state: The cloud state to set the runner. + """ + runner_dict = runner_scaler._manager._github.state.runners + assert len(runner_dict) == 1, "Test arrange failed: One runner should be present" + instance_id = list(runner_dict.keys())[0] + if github_state is not None: + runner_dict[instance_id].github_state = github_state + if cloud_state is not None: + runner_dict[instance_id].cloud_state = cloud_state + + +def assert_runner_info( + runner_scaler: RunnerScaler, online: int = 0, busy: int = 0, offline: int = 0, unknown: int = 0 +) -> None: + """Assert runner info contains a certain amount of runners. + + Args: + runner_scaler: The RunnerScaler to get information from. + online: The number of online runners to assert for. + busy: The number of busy runners to assert for. + offline: The number of offline runners to assert for.
+ unknown: The number of unknown runners to assert for. + """ + info = runner_scaler.get_runner_info() + assert info.offline == offline + assert info.online == online + assert info.busy == busy + assert info.unknown == unknown + assert isinstance(info.runners, tuple) + assert len(info.runners) == online + assert isinstance(info.busy_runners, tuple) + assert len(info.busy_runners) == busy + + +def test_get_no_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: Get runner information. + Assert: Information should contain no runners. + """ + assert_runner_info(runner_scaler, online=0) + + +def test_flush_no_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: + 1. Flush idle runners. + 2. Flush busy runners. + Assert: + 1. No change in number of runners. Runner info should contain no runners. + 2. No change in number of runners. + """ + # 1. + diff = runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + # 2. + diff = runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + +def test_reconcile_runner_create_one(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: Reconcile to no runners. + Assert: No changes. Runner info should contain no runners. + """ + diff = runner_scaler.reconcile(quantity=0) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + +def test_one_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: + 1. Reconcile to one runner. + 2. Reconcile to one runner. + 3. Flush idle runners. + 4. Reconcile to one runner. + Assert: + 1. Runner info has one runner. + 2. No changes to number of runner. + 3. No runners. + 4. Runner info has one runner. + """ + # 1. 
+ diff = runner_scaler.reconcile(1) + assert diff == 1 + assert_runner_info(runner_scaler, online=1) + + # 2. + diff = runner_scaler.reconcile(1) + assert diff == 0 + assert_runner_info(runner_scaler, online=1) + + # 3. + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + assert_runner_info(runner_scaler, online=0) + + # 4. + diff = runner_scaler.reconcile(1) + assert diff == 1 + assert_runner_info(runner_scaler, online=1) + + +def test_flush_busy_on_idle_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one idle runner. + Act: Run flush busy runner. + Assert: No runners. + """ + runner_scaler = runner_scaler_one_runner + + runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert_runner_info(runner_scaler, online=0) + + +def test_flush_busy_on_busy_runner( + runner_scaler_one_runner: RunnerScaler, +): + """ + Arrange: A RunnerScaler with one busy runner. + Act: Run flush busy runner. + Assert: No runners. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.BUSY) + + runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert_runner_info(runner_scaler, online=0) + + +def test_get_runner_one_busy_runner( + runner_scaler_one_runner: RunnerScaler, +): + """ + Arrange: A RunnerScaler with one busy runner. + Act: Run get runners. + Assert: One busy runner. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.BUSY) + + assert_runner_info(runner_scaler=runner_scaler, online=1, busy=1) + + +def test_get_runner_offline_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one offline runner. + Act: Run get runners. + Assert: One offline runner. 
+ """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.OFFLINE) + + assert_runner_info(runner_scaler=runner_scaler, offline=1) + + +def test_get_runner_unknown_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one unknown runner. + Act: Run get runners. + Assert: One unknown runner. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, "UNKNOWN") + + assert_runner_info(runner_scaler=runner_scaler, unknown=1) From d4fdc6985523eefc7f46dca16913550d03158a52 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Fri, 6 Sep 2024 07:34:51 +0200 Subject: [PATCH 04/10] Set proxy_env_vars in OpenStackRunnerManager (#363) * set proxy_env_vars in OpenStackRunnerManager * lint * disable aproxy in private endpoint runners * disable aproxy conditionally * enable log_cli * temporarily disable env vars to test if integration test fails - REVERT ME * disable aproxy unconditionally * Revert latest commits Revert "disable aproxy in private endpoint runners" This reverts commit acfc35ab168e4f6b62328b958cbc24ed2ed368dd. Revert "disable aproxy conditionally" This reverts commit e45878e370987c9e34cc265e35ca3c3c5570f819. Revert "enable log_cli" This reverts commit f8207a7a73c2ebb2c7286e66b8917065ae68ae70. Revert "temporarily disable env vars to test if integration test fails - REVERT ME" This reverts commit e0dca27e309d192b6412be2c6e4a992ac2bc22bc. Revert "disable aproxy unconditionally" This reverts commit c47ef6e53810ed67b7b0b5ad7bf80b4e14f20d78.
--- src-docs/openstack_cloud.openstack_runner_manager.md | 12 ++++++------ src/openstack_cloud/openstack_runner_manager.py | 11 ++++++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 7f28b8689..64e7ce91d 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -133,7 +133,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -156,7 +156,7 @@ Cleanup runner and resource on the cloud. --- - + ### method `create_runner` @@ -186,7 +186,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -210,7 +210,7 @@ Delete self-hosted runners. --- - + ### method `flush_runners` @@ -233,7 +233,7 @@ Remove idle and/or busy runners. --- - + ### method `get_runner` @@ -256,7 +256,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py index aa03b0ec3..11bac0b92 100644 --- a/src/openstack_cloud/openstack_runner_manager.py +++ b/src/openstack_cloud/openstack_runner_manager.py @@ -42,7 +42,7 @@ from metrics import storage as metrics_storage from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance from repo_policy_compliance_client import RepoPolicyComplianceClient -from utilities import retry +from utilities import retry, set_env_var logger = logging.getLogger(__name__) @@ -149,6 +149,15 @@ def __init__( # pylint: disable=R0913 prefix=self.name_prefix, ) + # Setting the env var to this process and any child process spawned. 
+ proxies = service_config.proxy_config + if no_proxy := proxies.no_proxy: + set_env_var("NO_PROXY", no_proxy) + if http_proxy := proxies.http: + set_env_var("HTTP_PROXY", http_proxy) + if https_proxy := proxies.https: + set_env_var("HTTPS_PROXY", https_proxy) + @property def name_prefix(self) -> str: """The prefix of runner names. From 0292abbef10bac5e0a45fd7dea3460aa2badef48 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Mon, 9 Sep 2024 15:40:46 +0200 Subject: [PATCH 05/10] Externalise Runner Manager (#358) * Spawn a manual test env * Disable spawning on manual test env * Remove useless class * Fix runner deletion * Fix import error * Add more docs * Fix get no-existing openstack server * Add debug statement * Fix variable name and function name mixup * Fix id variable name, function name mixup * Add debug statement * Move debug * Add busy runner test * Add debug statement. * Disable some test * Disable some test * Fix runner label in workflow * Fix lambda * Debug * Debug * Add debug * Start new manual test env * Add none check * Fix missing prefix * Add more logging * Refactor runner manager one runner fixture * Fix error string formatting * Adding the docstring for github_runner_manager * Fix test fixture scope * Add docstring on cloud_runner_manager * Add debug * Fix docstring for cloud runner manager * Add more docstrings * Add metrics for deleted and cleanup runners * Enable tests again * Add debug * Get runner info not on GitHub * Fix dict access * Add debug of userdata * Fix metric path * Debug metric * Fix variable naming * Test * Fix iterator * Debug * Debug * Fix for iterator return value * Add more log path patching * Fix path naming * Fix monkey patch * Start a arm64 manual test env * Not spawning manual test env * Update fmt * Fix metric storage implementation for openstack * Fix metric storage provider usage in openstack runner manager * Debug * Fix iterator * Add debug * Fix None in iterator * Add debug * Trying fix for get runner filter * 
Add test * Patch the path for logs * Add cleanup test * Debug * Fix github state determining busy runner * Fix wrong naming for method in ABC * Remove debugging * Add more docstrings * Fix runner deletion * Add more docs * Fix typing * Debug * Update SSH health check * Tmp disable a passing test * Add deubg * Remove a debug * Fix Cloud runner state init * Change clean up to cleanup * Fix attr naming issue in openstack cloud * Fix reference to non-existing instance_name in openstack cloud * Add metric log processing to test * Enable all tests * Fix health check return value * Fix all flake8 lints * Fix test * Fix all lints * Fix unit test issue due to method sig change * Ignore openstack cloud from coverage due to the test requires private endpoint * Enable all tests * Remove a repeated test * Re-enable test.yaml * Fix integration tests workflwo * Add docs on cleanup method of cloud runner manager * Add parallel spawning of runners. * Enable dev testing * Fix parallel spawn * Allow openstack server to take a bit of time on deletion * Refactor test detection of no runners * Re-enable the tests * Fix lints * Disable tests again * Disable some test * Add wait until runner is running * Enable openstack runner manager tests * Add debug * Wait for github state * Refactor wait until runner spawn * Add keyfile erorr * Remove debug statement * Re-enable all tests * Update src/manager/github_runner_manager.py Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> * Update src/openstack_cloud/openstack_cloud.py Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> * Suggestions * Refactor remove openstack server * Test spawning two runners. * Fix test * Fix naming * Fix according comment * Fix clouds yaml write issue. 
* Fix format * Add delete runner by amount * Add getting runner health state for metrics * Fix security group ID issues * Fix according to review * Refactor health state for runner * Fix lint issues * Add missing docs * Update the github state enum to use auto * Rename class to fit convension * Fix according to review * Fix name_prefix property cloud runner manager * Add class for scaling runners * Fix lints * Fix unit test * Fix according to review comment * Fix test according to comments * Fix unit test * Fix typo of attr * Add debug * Add debug statement * Debug * Fix return code of the kill command * Remove debug * Add comments on the flush kill command * Add debug * Fix debug * Debug * Debug * Remove debug * Add cleanup during idle and busy runner test * Debug * Disable tests during debug * Debug missing keyfiles * Fix keyfile path matching issue * testing * debug * Add debug * Use OR * debug * Debug * Debug * Debug * Debug * Debug * Fix flush mode * Remove debug * Re-enable all tests * Initial unit test for runner scaler * Add more unit tests for runner scaler * Add more tests * Fix merge issues * Fix states in get_runners methods * Add docstring for unit test mocks * Fix construction of repo-policy-compliance from config * Fix get_runners action output * Fix the lints * Fix a naming issue * Fix naming prefix of runner * Improve unit test * Remove the old OpenstackRunnerManager * Fix test contstruction of runner manager. * Fix flavor naming * Fix flush action result output. * Fix flavor of metric * Testing out a integration test fikx * change flush runner to flush idle. * Add debug in integration test * Manual test mode * Start new manual test env * Spawn x64 manual test env. 
* Improve logging during reconcile * Fix crashed metric collection * Remove debug workflow * Format * Test * externalise * fix tests * Add reactive back in * Fix flushing of runners * Debug workflow * Add debug * Fix logging of health state * Remove debug * Debug * Fix set contruction * Fix SSH key path in integration test setup * Add more checks to repo-policy-compliance setup in tests * Fix key path check * Fix format string issue * Fix format string typo * fix integration test import * outcomment externalised workflow * Add some logging of test setup * Fix missing await * Revert config-change flushing * Add maintance status for image relation change * Fix HTTP format * Revert "outcomment externalised workflow" This reverts commit e0a78af914a097814b3a4c1df013da5e40bf9ac8. * re-checkin integration test * use github types from externalised app * use github types from externalised app * Update coverage ignore of github_runner_manager * Minor fix in test comments * lint and fix unit tests * fix merge * remove reactive script * fix merge * remove unused OpenstackUnauthorizedError * final new line * remove code duplication * remove duplicate src-docs * remove openstack-userdata.sh.j2 * pin commit in github-runner-manager --------- Co-authored-by: yhaliaw <43424755+yhaliaw@users.noreply.github.com> Co-authored-by: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> --- pyproject.toml | 2 +- requirements.txt | 1 + scripts/reactive_runner.py | 50 -- src-docs/charm.md | 10 +- .../{charm_state.py.md => charm_state.md} | 589 +++++++------ src-docs/errors.md | 232 +---- src-docs/github_client.md | 169 +--- src-docs/runner.md | 16 +- src-docs/runner_manager.md | 6 +- src-docs/runner_type.md | 10 +- src-docs/shared_fs.md | 8 +- src-docs/utilities.md | 100 +-- src/charm.py | 33 +- src/charm_state.py | 80 +- src/errors.py | 94 +- src/github_client.py | 250 +----- src/github_type.py | 164 ---- src/logrotate.py | 4 +- src/manager/cloud_runner_manager.py | 203 ----- 
src/manager/github_runner_manager.py | 133 --- src/manager/runner_manager.py | 364 -------- src/manager/runner_scaler.py | 215 ----- src/metrics/__init__.py | 4 - src/metrics/events.py | 167 ---- src/metrics/github.py | 53 -- src/metrics/runner.py | 470 ---------- src/metrics/runner_logs.py | 54 -- src/metrics/storage.py | 192 ---- src/metrics/type.py | 23 - src/openstack_cloud/__init__.py | 78 -- src/openstack_cloud/openstack_cloud.py | 597 ------------- .../openstack_runner_manager.py | 830 ------------------ src/reactive/__init__.py | 4 - src/reactive/consumer.py | 112 --- src/reactive/runner_manager.py | 141 --- src/repo_policy_compliance_client.py | 73 -- src/runner.py | 7 +- src/runner_manager.py | 16 +- src/runner_manager_type.py | 6 +- src/runner_type.py | 4 +- src/shared_fs.py | 3 +- src/utilities.py | 149 +--- templates/openstack-userdata.sh.j2 | 105 --- tests/integration/helpers/charm_metrics.py | 6 +- tests/integration/helpers/openstack.py | 2 +- .../integration/test_charm_metrics_failure.py | 4 +- .../integration/test_charm_metrics_success.py | 2 +- tests/integration/test_reactive.py | 4 +- .../test_runner_manager_openstack.py | 31 +- tests/integration/test_self_hosted_runner.py | 2 +- tests/unit/conftest.py | 12 +- tests/unit/metrics/__init__.py | 2 - tests/unit/metrics/test_events.py | 57 -- tests/unit/metrics/test_github.py | 70 -- tests/unit/metrics/test_runner.py | 649 -------------- tests/unit/metrics/test_runner_logs.py | 31 - tests/unit/metrics/test_storage.py | 168 ---- tests/unit/mock.py | 3 +- tests/unit/mock_runner_managers.py | 13 +- tests/unit/reactive/__init__.py | 2 - tests/unit/reactive/test_consumer.py | 89 -- tests/unit/reactive/test_runner_manager.py | 175 ---- tests/unit/test_charm.py | 6 +- tests/unit/test_charm_state.py | 21 +- tests/unit/test_github_client.py | 208 ----- tests/unit/test_lxd_runner_manager.py | 21 +- tests/unit/test_openstack_cloud.py | 41 - tests/unit/test_runner.py | 13 +- tests/unit/test_runner_scaler.py | 25 
+- tests/unit/test_shared_fs.py | 2 +- 70 files changed, 528 insertions(+), 6952 deletions(-) delete mode 100644 scripts/reactive_runner.py rename src-docs/{charm_state.py.md => charm_state.md} (71%) delete mode 100644 src/github_type.py delete mode 100644 src/manager/cloud_runner_manager.py delete mode 100644 src/manager/github_runner_manager.py delete mode 100644 src/manager/runner_manager.py delete mode 100644 src/manager/runner_scaler.py delete mode 100644 src/metrics/__init__.py delete mode 100644 src/metrics/events.py delete mode 100644 src/metrics/github.py delete mode 100644 src/metrics/runner.py delete mode 100644 src/metrics/runner_logs.py delete mode 100644 src/metrics/storage.py delete mode 100644 src/metrics/type.py delete mode 100644 src/openstack_cloud/__init__.py delete mode 100644 src/openstack_cloud/openstack_cloud.py delete mode 100644 src/openstack_cloud/openstack_runner_manager.py delete mode 100644 src/reactive/__init__.py delete mode 100644 src/reactive/consumer.py delete mode 100644 src/reactive/runner_manager.py delete mode 100644 src/repo_policy_compliance_client.py delete mode 100644 templates/openstack-userdata.sh.j2 delete mode 100644 tests/unit/metrics/__init__.py delete mode 100644 tests/unit/metrics/test_events.py delete mode 100644 tests/unit/metrics/test_github.py delete mode 100644 tests/unit/metrics/test_runner.py delete mode 100644 tests/unit/metrics/test_runner_logs.py delete mode 100644 tests/unit/metrics/test_storage.py delete mode 100644 tests/unit/reactive/__init__.py delete mode 100644 tests/unit/reactive/test_consumer.py delete mode 100644 tests/unit/reactive/test_runner_manager.py delete mode 100644 tests/unit/test_github_client.py delete mode 100644 tests/unit/test_openstack_cloud.py diff --git a/pyproject.toml b/pyproject.toml index d16bac3a9..f4a49bd2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ omit = [ ] [tool.coverage.report] -fail_under = 85 +fail_under = 83 show_missing = true 
[tool.pytest.ini_options] diff --git a/requirements.txt b/requirements.txt index 541c0d4c9..4d219d184 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ PyYAML ==6.0.* pyOpenSSL==24.2.1 kombu==5.4.0 pymongo==4.8.0 +github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@1f310b22b99a94bd5429184191558426b014ee82 diff --git a/scripts/reactive_runner.py b/scripts/reactive_runner.py deleted file mode 100644 index e9b996ff6..000000000 --- a/scripts/reactive_runner.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Script to spawn a reactive runner process.""" -import logging -import os -import sys - -from reactive.consumer import consume -from reactive.runner_manager import MQ_URI_ENV_VAR, QUEUE_NAME_ENV_VAR - - -def setup_root_logging() -> None: - """Set up logging for the reactive runner process.""" - # setup root logger to log in a file which will be picked up by grafana agent and sent to Loki - logging.basicConfig( - stream=sys.stdout, - level=logging.DEBUG, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - ) - - -def main() -> None: - """Spawn a process that consumes a message from the queue to create a runner. - - Raises: - ValueError: If the required environment variables are not set - """ - mq_uri = os.environ.get(MQ_URI_ENV_VAR) - queue_name = os.environ.get(QUEUE_NAME_ENV_VAR) - - if not mq_uri: - raise ValueError( - f"Missing {MQ_URI_ENV_VAR} environment variable. " - "Please set it to the message queue URI." - ) - - if not queue_name: - raise ValueError( - f"Missing {QUEUE_NAME_ENV_VAR} environment variable. " - "Please set it to the name of the queue." 
- ) - - setup_root_logging() - consume(mq_uri, queue_name) - - -if __name__ == "__main__": - main() diff --git a/src-docs/charm.md b/src-docs/charm.md index 9fd2aac04..fa38de542 100644 --- a/src-docs/charm.md +++ b/src-docs/charm.md @@ -20,7 +20,7 @@ Charm for creating and managing GitHub self-hosted runner instances. --- - + ## function `catch_charm_errors` @@ -46,7 +46,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -72,7 +72,7 @@ Catch common errors in actions. --- - + ## class `ReconcileRunnersEvent` Event representing a periodic check to ensure runners are ok. @@ -83,7 +83,7 @@ Event representing a periodic check to ensure runners are ok. --- - + ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. @@ -100,7 +100,7 @@ Charm for managing GitHub self-hosted runners. - `ram_pool_path`: The path to memdisk storage. - `kernel_module_path`: The path to kernel modules. - + ### method `__init__` diff --git a/src-docs/charm_state.py.md b/src-docs/charm_state.md similarity index 71% rename from src-docs/charm_state.py.md rename to src-docs/charm_state.md index 5783f6821..9b4889d5e 100644 --- a/src-docs/charm_state.py.md +++ b/src-docs/charm_state.md @@ -2,7 +2,7 @@ -# module `charm_state.py` +# module `charm_state` State of the Charm. **Global Variables** @@ -33,49 +33,100 @@ State of the Charm. - **COS_AGENT_INTEGRATION_NAME** - **DEBUG_SSH_INTEGRATION_NAME** - **IMAGE_INTEGRATION_NAME** +- **MONGO_DB_INTEGRATION_NAME** - **LTS_IMAGE_VERSION_TAG_MAP** + +--- + + + +## class `AnyHttpsUrl` +Represents an HTTPS URL. + + + +**Attributes:** + + - `allowed_schemes`: Allowed schemes for the URL. + + + + + --- - + + +## class `GithubConfig` +Charm configuration related to GitHub. -## function `parse_github_path` + + +**Attributes:** + + - `token`: The Github API access token (PAT). + - `path`: The Github org/repo path. 
+ + + +### method `__init__` ```python -parse_github_path(path_str: str, runner_group: str) → GithubOrg | GithubRepo +__init__(token: str, path: GitHubOrg | GitHubRepo) → None ``` -Parse GitHub path. + + + + + + + +--- + + + +### classmethod `from_charm` + +```python +from_charm(charm: CharmBase) → GithubConfig +``` + +Get github related charm configuration values from charm. **Args:** - - `path_str`: GitHub path in string format. - - `runner_group`: Runner group name for GitHub organization. If the path is a repository this argument is ignored. + - `charm`: The charm instance. **Raises:** - - `CharmConfigInvalidError`: if an invalid path string was given. + - `CharmConfigInvalidError`: If an invalid configuration value was set. **Returns:** - GithubPath object representing the GitHub repository, or the GitHub organization with runner group information. + The parsed GitHub configuration values. --- -## class `AnyHttpsUrl` -Represents an HTTPS URL. + + +## class `VirtualMachineResources` +Virtual machine resource configuration. **Attributes:** - - `allowed_schemes`: Allowed schemes for the URL. + - `cpu`: Number of vCPU for the virtual machine. + - `memory`: Amount of memory for the virtual machine. + - `disk`: Amount of disk for the virtual machine. @@ -83,6 +134,8 @@ Represents an HTTPS URL. --- + + ## class `Arch` Supported system architectures. @@ -99,15 +152,17 @@ Supported system architectures. --- -## class `BaseImage` -The ubuntu OS base image to build and deploy runners on. + + +## class `RunnerStorage` +Supported storage as runner disk. **Attributes:** - - `JAMMY`: The jammy ubuntu LTS image. - - `NOBLE`: The noble ubuntu LTS image. + - `JUJU_STORAGE`: Represents runner storage from Juju storage. + - `MEMORY`: Represents tempfs storage (ramdisk). @@ -115,64 +170,80 @@ The ubuntu OS base image to build and deploy runners on. --- -## class `CharmConfig` -General charm configuration. + -Some charm configurations are grouped into other configuration models. 
+## class `InstanceType` +Type of instance for runner. **Attributes:** - - `denylist`: List of IPv4 to block the runners from accessing. - - `dockerhub_mirror`: Private docker registry as dockerhub mirror for the runners to use. - - `labels`: Additional runner labels to append to default (i.e. os, flavor, architecture). - - `openstack_clouds_yaml`: The openstack clouds.yaml configuration. - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - `reconcile_interval`: Time between each reconciliation of runners in minutes. - - `repo_policy_compliance`: Configuration for the repo policy compliance service. - - `token`: GitHub personal access token for GitHub API. + - `LOCAL_LXD`: LXD instance on the local juju machine. + - `OPENSTACK`: OpenStack instance on a cloud. + --- - + + +## class `CharmConfigInvalidError` +Raised when charm config is invalid. + -### classmethod `check_reconcile_interval` + +**Attributes:** + + - `msg`: Explanation of the error. + + + +### method `__init__` ```python -check_reconcile_interval(reconcile_interval: int) → int +__init__(msg: str) ``` -Validate the general charm configuration. +Initialize a new instance of the CharmConfigInvalidError exception. **Args:** - - `reconcile_interval`: The value of reconcile_interval passed to class instantiation. + - `msg`: Explanation of the error. -**Raises:** + + +--- + + + +## class `RepoPolicyComplianceConfig` +Configuration for the repo policy compliance service. + + + +**Attributes:** - - `ValueError`: if an invalid reconcile_interval value of less than 2 has been passed. + - `token`: Token for the repo policy compliance service. + - `url`: URL of the repo policy compliance service. -**Returns:** - The validated reconcile_interval value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → CharmConfig +from_charm(charm: CharmBase) → RepoPolicyComplianceConfig ``` Initialize the config from charm. 
@@ -187,121 +258,96 @@ Initialize the config from charm. **Raises:** - - `CharmConfigInvalidError`: If any invalid configuration has been set on the charm. + - `CharmConfigInvalidError`: If an invalid configuration was set. **Returns:** - Current config of the charm. + Current repo-policy-compliance config. --- -## class `CharmConfigInvalidError` -Raised when charm config is invalid. + + +## class `OpenStackCloudsYAML` +The OpenStack clouds YAML dict mapping. **Attributes:** - - `msg`: Explanation of the error. - - - -### function `__init__` - -```python -__init__(msg: str) -``` - -Initialize a new instance of the CharmConfigInvalidError exception. - + - `clouds`: The map of cloud name to cloud connection info. -**Args:** - - - `msg`: Explanation of the error. +--- + ---- +## class `CharmConfig` +General charm configuration. -## class `CharmState` -The charm state. +Some charm configurations are grouped into other configuration models. **Attributes:** - - `arch`: The underlying compute architecture, i.e. x86_64, amd64, arm64/aarch64. - - `charm_config`: Configuration of the juju charm. - - `is_metrics_logging_available`: Whether the charm is able to issue metrics. - - `proxy_config`: Proxy-related configuration. - - `instance_type`: The type of instances, e.g., local lxd, openstack. - - `runner_config`: The charm configuration related to runner VM configuration. - - `ssh_debug_connections`: SSH debug connections configuration information. + - `denylist`: List of IPv4 to block the runners from accessing. + - `dockerhub_mirror`: Private docker registry as dockerhub mirror for the runners to use. + - `labels`: Additional runner labels to append to default (i.e. os, flavor, architecture). + - `openstack_clouds_yaml`: The openstack clouds.yaml configuration. + - `path`: GitHub repository path in the format '/', or the GitHub organization name. + - `reconcile_interval`: Time between each reconciliation of runners in minutes. 
+ - `repo_policy_compliance`: Configuration for the repo policy compliance service. + - `token`: GitHub personal access token for GitHub API. --- - + -### classmethod `from_charm` +### classmethod `check_reconcile_interval` ```python -from_charm(charm: CharmBase) → CharmState +check_reconcile_interval(reconcile_interval: int) → int ``` -Initialize the state from charm. +Validate the general charm configuration. **Args:** - - `charm`: The charm instance. + - `reconcile_interval`: The value of reconcile_interval passed to class instantiation. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration was set. + - `ValueError`: if an invalid reconcile_interval value of less than 2 has been passed. **Returns:** - Current state of the charm. - - ---- - -## class `GithubConfig` -Charm configuration related to GitHub. - - - -**Attributes:** - - - `token`: The Github API access token (PAT). - - `path`: The Github org/repo path. - - - + The validated reconcile_interval value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → GithubConfig +from_charm(charm: CharmBase) → CharmConfig ``` -Get github related charm configuration values from charm. +Initialize the config from charm. @@ -313,123 +359,128 @@ Get github related charm configuration values from charm. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration value was set. + - `CharmConfigInvalidError`: If any invalid configuration has been set on the charm. **Returns:** - The parsed GitHub configuration values. + Current config of the charm. --- -## class `GithubOrg` -Represent GitHub organization. + + +## class `BaseImage` +The ubuntu OS base image to build and deploy runners on. **Attributes:** - - `org`: Name of the GitHub organization. - - `group`: Runner group to spawn the runners in. + - `JAMMY`: The jammy ubuntu LTS image. + - `NOBLE`: The noble ubuntu LTS image. 
+ --- - + -### function `path` +## class `OpenstackImage` +OpenstackImage from image builder relation data. -```python -path() → str -``` -Return a string representing the path. +**Attributes:** + + - `id`: The OpenStack image ID. + - `tags`: Image tags, e.g. jammy -**Returns:** - Path to the GitHub entity. --- -## class `GithubRepo` -Represent GitHub repository. + +### classmethod `from_charm` +```python +from_charm(charm: CharmBase) → OpenstackImage | None +``` -**Attributes:** +Initialize the OpenstackImage info from relation data. + +None represents relation not established. None values for id/tags represent image not yet ready but the relation exists. + + + +**Args:** - - `owner`: Owner of the GitHub repository. - - `repo`: Name of the GitHub repository. + - `charm`: The charm instance. +**Returns:** + OpenstackImage metadata from charm relation data. + --- - + -### function `path` +## class `OpenstackRunnerConfig` +Runner configuration for OpenStack Instances. -```python -path() → str -``` -Return a string representing the path. +**Attributes:** + + - `virtual_machines`: Number of virtual machine-based runner to spawn. + - `openstack_flavor`: flavor on openstack to use for virtual machines. + - `openstack_network`: Network on openstack to use for virtual machines. + - `openstack_image`: Openstack image to use for virtual machines. -**Returns:** - Path to the GitHub entity. --- -## class `ImmutableConfigChangedError` -Represents an error when changing immutable charm state. + - - -### function `__init__` +### classmethod `from_charm` ```python -__init__(msg: str) +from_charm(charm: CharmBase) → OpenstackRunnerConfig ``` -Initialize a new instance of the ImmutableConfigChangedError exception. +Initialize the config from charm. **Args:** - - `msg`: Explanation of the error. - - - - - ---- - -## class `InstanceType` -Type of instance for runner. + - `charm`: The charm instance. -**Attributes:** +**Raises:** - - `LOCAL_LXD`: LXD instance on the local juju machine. 
- - `OPENSTACK`: OpenStack instance on a cloud. + - `CharmConfigInvalidError`: Error with charm configuration virtual-machines not of int type. +**Returns:** + Openstack runner config of the charm. --- + + ## class `LocalLxdRunnerConfig` Runner configurations for local LXD instances. @@ -447,7 +498,7 @@ Runner configurations for local LXD instances. --- - + ### classmethod `check_virtual_machine_resources` @@ -478,7 +529,7 @@ Validate the virtual_machine_resources field values. --- - + ### classmethod `check_virtual_machines` @@ -507,7 +558,7 @@ Validate the virtual machines configuration value. --- - + ### classmethod `from_charm` @@ -537,73 +588,71 @@ Initialize the config from charm. --- -## class `OpenstackImage` -OpenstackImage from image builder relation data. + + +## class `ProxyConfig` +Proxy configuration. **Attributes:** - - `id`: The OpenStack image ID. - - `tags`: Image tags, e.g. jammy + - `aproxy_address`: The address of aproxy snap instance if use_aproxy is enabled. + - `http`: HTTP proxy address. + - `https`: HTTPS proxy address. + - `no_proxy`: Comma-separated list of hosts that should not be proxied. + - `use_aproxy`: Whether aproxy should be used for the runners. +--- + +#### property aproxy_address + +Return the aproxy address. + --- - + -### classmethod `from_charm` +### classmethod `check_use_aproxy` ```python -from_charm(charm: CharmBase) → OpenstackImage | None +check_use_aproxy(use_aproxy: bool, values: dict) → bool ``` -Initialize the OpenstackImage info from relation data. - -None represents relation not established. None values for id/tags represent image not yet ready but the relation exists. +Validate the proxy configuration. **Args:** - - `charm`: The charm instance. - - - -**Returns:** - OpenstackImage metadata from charm relation data. - - ---- - -## class `OpenstackRunnerConfig` -Runner configuration for OpenStack Instances. + - `use_aproxy`: Value of use_aproxy variable. + - `values`: Values in the pydantic model. 
-**Attributes:** +**Raises:** - - `virtual_machines`: Number of virtual machine-based runner to spawn. - - `openstack_flavor`: flavor on openstack to use for virtual machines. - - `openstack_network`: Network on openstack to use for virtual machines. - - `openstack_image`: Openstack image to use for virtual machines. + - `ValueError`: if use_aproxy was set but no http/https was passed. +**Returns:** + Validated use_aproxy value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → OpenstackRunnerConfig +from_charm(charm: CharmBase) → ProxyConfig ``` -Initialize the config from charm. +Initialize the proxy config from charm. @@ -613,81 +662,73 @@ Initialize the config from charm. -**Raises:** - - - `CharmConfigInvalidError`: Error with charm configuration virtual-machines not of int type. - - - **Returns:** - Openstack runner config of the charm. + Current proxy config of the charm. --- -## class `ProxyConfig` -Proxy configuration. + + +## class `UnsupportedArchitectureError` +Raised when given machine charm architecture is unsupported. **Attributes:** - - `aproxy_address`: The address of aproxy snap instance if use_aproxy is enabled. - - `http`: HTTP proxy address. - - `https`: HTTPS proxy address. - - `no_proxy`: Comma-separated list of hosts that should not be proxied. - - `use_aproxy`: Whether aproxy should be used for the runners. + - `arch`: The current machine architecture. + ---- +### method `__init__` -#### property aproxy_address +```python +__init__(arch: str) → None +``` -Return the aproxy address. +Initialize a new instance of the CharmConfigInvalidError exception. ---- +**Args:** + + - `arch`: The current machine architecture. - -### classmethod `check_use_aproxy` -```python -check_use_aproxy(use_aproxy: bool, values: dict) → bool -``` -Validate the proxy configuration. +--- + -**Args:** - - - `use_aproxy`: Value of use_aproxy variable. - - `values`: Values in the pydantic model. 
+## class `SSHDebugConnection` +SSH connection information for debug workflow. -**Raises:** +**Attributes:** - - `ValueError`: if use_aproxy was set but no http/https was passed. + - `host`: The SSH relay server host IP address inside the VPN. + - `port`: The SSH relay server port. + - `rsa_fingerprint`: The host SSH server public RSA key fingerprint. + - `ed25519_fingerprint`: The host SSH server public ed25519 key fingerprint. -**Returns:** - Validated use_aproxy value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → ProxyConfig +from_charm(charm: CharmBase) → list['SSHDebugConnection'] ``` -Initialize the proxy config from charm. +Initialize the SSHDebugInfo from charm relation data. @@ -698,65 +739,77 @@ Initialize the proxy config from charm. **Returns:** - Current proxy config of the charm. + List of connection information for ssh debug access. --- -## class `RepoPolicyComplianceConfig` -Configuration for the repo policy compliance service. + + +## class `ReactiveConfig` +Represents the configuration for reactive scheduling. **Attributes:** - - `token`: Token for the repo policy compliance service. - - `url`: URL of the repo policy compliance service. + - `mq_uri`: The URI of the MQ to use to spawn runners reactively. --- - + -### classmethod `from_charm` +### classmethod `from_database` ```python -from_charm(charm: CharmBase) → RepoPolicyComplianceConfig +from_database(database: DatabaseRequires) → ReactiveConfig | None ``` -Initialize the config from charm. +Initialize the ReactiveConfig from charm config and integration data. **Args:** - - `charm`: The charm instance. + - `database`: The database to fetch integration data from. + + + +**Returns:** + The connection information for the reactive MQ or None if not available. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration was set. + - `MissingMongoDBError`: If the information on howto access MongoDB is missing in the integration data. 
+--- -**Returns:** - Current repo-policy-compliance config. + +## class `ImmutableConfigChangedError` +Represents an error when changing immutable charm state. ---- + -## class `RunnerStorage` -Supported storage as runner disk. +### method `__init__` +```python +__init__(msg: str) +``` +Initialize a new instance of the ImmutableConfigChangedError exception. -**Attributes:** + + +**Args:** - - `JUJU_STORAGE`: Represents runner storage from Juju storage. - - `MEMORY`: Represents tempfs storage (ramdisk). + - `msg`: Explanation of the error. @@ -764,90 +817,76 @@ Supported storage as runner disk. --- -## class `SSHDebugConnection` -SSH connection information for debug workflow. + + +## class `CharmState` +The charm state. **Attributes:** - - `host`: The SSH relay server host IP address inside the VPN. - - `port`: The SSH relay server port. - - `rsa_fingerprint`: The host SSH server public RSA key fingerprint. - - `ed25519_fingerprint`: The host SSH server public ed25519 key fingerprint. - - - - ---- + - `arch`: The underlying compute architecture, i.e. x86_64, amd64, arm64/aarch64. + - `charm_config`: Configuration of the juju charm. + - `is_metrics_logging_available`: Whether the charm is able to issue metrics. + - `proxy_config`: Proxy-related configuration. + - `instance_type`: The type of instances, e.g., local lxd, openstack. + - `reactive_config`: The charm configuration related to reactive spawning mode. + - `runner_config`: The charm configuration related to runner VM configuration. + - `ssh_debug_connections`: SSH debug connections configuration information. 
- + -### classmethod `from_charm` +### method `__init__` ```python -from_charm(charm: CharmBase) → list['SSHDebugConnection'] +__init__( + arch: Arch, + is_metrics_logging_available: bool, + proxy_config: ProxyConfig, + instance_type: InstanceType, + charm_config: CharmConfig, + runner_config: OpenstackRunnerConfig | LocalLxdRunnerConfig, + reactive_config: ReactiveConfig | None, + ssh_debug_connections: list[SSHDebugConnection] +) → None ``` -Initialize the SSHDebugInfo from charm relation data. -**Args:** - - - `charm`: The charm instance. - -**Returns:** - List of connection information for ssh debug access. --- -## class `UnsupportedArchitectureError` -Raised when given machine charm architecture is unsupported. - + - -**Attributes:** - - - `arch`: The current machine architecture. - - - -### function `__init__` +### classmethod `from_charm` ```python -__init__(arch: str) → None +from_charm(charm: CharmBase, database: DatabaseRequires) → CharmState ``` -Initialize a new instance of the CharmConfigInvalidError exception. +Initialize the state from charm. **Args:** - - `arch`: The current machine architecture. - - - - - ---- - -## class `VirtualMachineResources` -Virtual machine resource configuration. + - `charm`: The charm instance. + - `database`: The database instance. -**Attributes:** +**Raises:** - - `cpu`: Number of vCPU for the virtual machine. - - `memory`: Amount of memory for the virtual machine. - - `disk`: Amount of disk for the virtual machine. + - `CharmConfigInvalidError`: If an invalid configuration was set. +**Returns:** + Current state of the charm. diff --git a/src-docs/errors.md b/src-docs/errors.md index ee5db5a11..c61dd8410 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -7,39 +7,6 @@ Errors used by the charm. ---- - - - -## class `RunnerError` -Generic runner error as base exception. - - - - - ---- - - - -## class `RunnerExecutionError` -Error for executing commands on runner. 
- - - - - ---- - - - -## class `RunnerFileLoadError` -Error for loading file on runner. - - - - - --- @@ -55,8 +22,8 @@ Error for runner creation failure. -## class `RunnerRemoveError` -Error for runner removal failure. +## class `RunnerFileLoadError` +Error for loading file on runner. @@ -66,8 +33,8 @@ Error for runner removal failure. -## class `RunnerStartError` -Error for runner start failure. +## class `RunnerRemoveError` +Error for runner removal failure. @@ -220,17 +187,6 @@ Represents an error raised when logrotate cannot be setup. -## class `MetricsStorageError` -Base class for all metrics storage errors. - - - - - ---- - - - ## class `SharedFilesystemError` Base class for all shared filesystem errors. @@ -240,51 +196,7 @@ Base class for all shared filesystem errors. --- - - -## class `CreateMetricsStorageError` -Represents an error when the metrics storage could not be created. - - - - - ---- - - - -## class `DeleteMetricsStorageError` -Represents an error when the metrics storage could not be deleted. - - - - - ---- - - - -## class `GetMetricsStorageError` -Represents an error when the metrics storage could not be retrieved. - - - - - ---- - - - -## class `QuarantineMetricsStorageError` -Represents an error when the metrics storage could not be quarantined. - - - - - ---- - - + ## class `SharedFilesystemMountError` Represents an error related to the mounting of the shared filesystem. @@ -295,84 +207,7 @@ Represents an error related to the mounting of the shared filesystem. --- - - -## class `RunnerMetricsError` -Base class for all runner metrics errors. - - - - - ---- - - - -## class `CorruptMetricDataError` -Represents an error with the data being corrupt. - - - - - ---- - - - -## class `GithubMetricsError` -Base class for all github metrics errors. - - - - - ---- - - - -## class `GithubClientError` -Base class for all github client errors. - - - - - ---- - - - -## class `GithubApiError` -Represents an error when the GitHub API returns an error. 
- - - - - ---- - - - -## class `TokenError` -Represents an error when the token is invalid or has not enough permissions. - - - - - ---- - - - -## class `JobNotFoundError` -Represents an error when the job could not be found on GitHub. - - - - - ---- - - + ## class `RunnerLogsError` Base class for all runner logs errors. @@ -381,58 +216,3 @@ Base class for all runner logs errors. ---- - - - -## class `OpenStackError` -Base class for OpenStack errors. - - - - - ---- - - - -## class `OpenStackInvalidConfigError` -Represents an invalid OpenStack configuration. - - - - - ---- - - - -## class `OpenStackUnauthorizedError` -Represents an unauthorized connection to OpenStack. - - - - - ---- - - - -## class `SSHError` -Represents an error while interacting with SSH. - - - - - ---- - - - -## class `KeyfileError` -Represents missing keyfile for SSH. - - - - - diff --git a/src-docs/github_client.md b/src-docs/github_client.md index fc0de8f7b..679c9f907 100644 --- a/src-docs/github_client.md +++ b/src-docs/github_client.md @@ -8,116 +8,20 @@ GitHub API client. Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner. ---- - - - -## function `catch_http_errors` - -```python -catch_http_errors( - func: Callable[~ParamT, ~ReturnT] -) → Callable[~ParamT, ~ReturnT] -``` - -Catch HTTP errors and raise custom exceptions. - - - -**Args:** - - - `func`: The target function to catch common errors for. - - - -**Returns:** - The decorated function. - --- - + ## class `GithubClient` GitHub API client. - - -### method `__init__` - -```python -__init__(token: str) -``` - -Instantiate the GiHub API client. - - - -**Args:** - - - `token`: GitHub personal token for API requests. - --- - - -### method `delete_runner` - -```python -delete_runner(path: GitHubOrg | GitHubRepo, runner_id: int) → None -``` - -Delete the self-hosted runner from GitHub. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. 
- - `runner_id`: Id of the runner. - ---- - - - -### method `get_job_info` - -```python -get_job_info( - path: GitHubRepo, - workflow_run_id: str, - runner_name: str -) → JobStats -``` - -Get information about a job for a specific workflow run. - - - -**Args:** - - - `path`: GitHub repository path in the format '/'. - - `workflow_run_id`: Id of the workflow run. - - `runner_name`: Name of the runner. - - - -**Raises:** - - - `TokenError`: if there was an error with the Github token crdential provided. - - `JobNotFoundError`: If no jobs were found. - - - -**Returns:** - Job information. - ---- - - + ### method `get_runner_application` @@ -150,73 +54,4 @@ Get runner application available for download for given arch. **Returns:** The runner application. ---- - - - -### method `get_runner_github_info` - -```python -get_runner_github_info(path: GitHubOrg | GitHubRepo) → list[SelfHostedRunner] -``` - -Get runner information on GitHub under a repo or org. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - - -**Returns:** - List of runner information. - ---- - - - -### method `get_runner_registration_token` - -```python -get_runner_registration_token(path: GitHubOrg | GitHubRepo) → str -``` - -Get token from GitHub used for registering runners. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - - -**Returns:** - The registration token. - ---- - - - -### method `get_runner_remove_token` - -```python -get_runner_remove_token(path: GitHubOrg | GitHubRepo) → str -``` - -Get token from GitHub used for removing runners. - - - -**Args:** - - - `path`: The Github org/repo path. - - - -**Returns:** - The removing token. 
- diff --git a/src-docs/runner.md b/src-docs/runner.md index b513ad697..d7bfb93c1 100644 --- a/src-docs/runner.md +++ b/src-docs/runner.md @@ -17,7 +17,7 @@ The `RunnerManager` class from `runner_manager.py` creates and manages a collect --- - + ## class `Snap` This class represents a snap installation. @@ -36,7 +36,7 @@ This class represents a snap installation. --- - + ## class `WgetExecutable` The executable to be installed through wget. @@ -66,7 +66,7 @@ __init__(url: str, cmd: str) → None --- - + ## class `CreateRunnerConfig` The configuration values for creating a single runner instance. @@ -105,7 +105,7 @@ __init__( --- - + ## class `Runner` Single instance of GitHub self-hosted runner. @@ -120,7 +120,7 @@ Single instance of GitHub self-hosted runner. - `runner_script`: The runner start script file path. - `pre_job_script`: The runner pre_job script file path. This is referenced in the env_file in the ACTIONS_RUNNER_HOOK_JOB_STARTED environment variable. - + ### method `__init__` @@ -149,7 +149,7 @@ Construct the runner instance. --- - + ### method `create` @@ -173,7 +173,7 @@ Create the runner instance on LXD and register it on GitHub. --- - + ### method `pull_logs` @@ -193,7 +193,7 @@ Expects the runner to have an instance. --- - + ### method `remove` diff --git a/src-docs/runner_manager.md b/src-docs/runner_manager.md index f52829efa..883745753 100644 --- a/src-docs/runner_manager.md +++ b/src-docs/runner_manager.md @@ -52,7 +52,7 @@ Construct RunnerManager object for creating and managing runners. --- - + ### method `build_runner_image` @@ -135,7 +135,7 @@ Get information on the runners from GitHub. --- - + ### method `get_latest_runner_bin_url` @@ -219,7 +219,7 @@ Install cron job for building runner image. --- - + ### method `update_runner_bin` diff --git a/src-docs/runner_type.md b/src-docs/runner_type.md index d5029f4f8..cde5b2a7e 100644 --- a/src-docs/runner_type.md +++ b/src-docs/runner_type.md @@ -9,7 +9,7 @@ Types used by Runner class. 
--- - + ## class `RunnerNameByHealth` Set of runners instance by health state. @@ -39,7 +39,7 @@ __init__(healthy: tuple[str, ], unhealthy: tuple[str, ]) → None --- - + ## class `ProxySetting` Represent HTTP-related proxy settings. @@ -76,7 +76,7 @@ __init__( --- - + ## class `RunnerConfig` Configuration for runner. @@ -123,7 +123,7 @@ __init__( --- - + ## class `RunnerStatus` Status of runner. @@ -160,7 +160,7 @@ __init__( --- - + ## class `RunnerGithubInfo` GitHub info of a runner. diff --git a/src-docs/shared_fs.md b/src-docs/shared_fs.md index 004556f9f..5ae59a8ca 100644 --- a/src-docs/shared_fs.md +++ b/src-docs/shared_fs.md @@ -13,7 +13,7 @@ Classes and functions to operate on the shared filesystem between the charm and --- - + ## function `create` @@ -45,7 +45,7 @@ The method is not idempotent and will raise an exception if the shared filesyste --- - + ## function `list_all` @@ -63,7 +63,7 @@ List all the metric storages. --- - + ## function `get` @@ -95,7 +95,7 @@ Mounts the filesystem if it is not currently mounted. --- - + ## function `delete` diff --git a/src-docs/utilities.md b/src-docs/utilities.md index 6c2aab4e1..b2c4cbf21 100644 --- a/src-docs/utilities.md +++ b/src-docs/utilities.md @@ -8,77 +8,7 @@ Utilities used by the charm. --- - - -## function `retry` - -```python -retry( - exception: Type[Exception] = , - tries: int = 1, - delay: float = 0, - max_delay: Optional[float] = None, - backoff: float = 1, - local_logger: Logger = -) → Callable[[Callable[~ParamT, ~ReturnT]], Callable[~ParamT, ~ReturnT]] -``` - -Parameterize the decorator for adding retry to functions. - - - -**Args:** - - - `exception`: Exception type to be retried. - - `tries`: Number of attempts at retry. - - `delay`: Time in seconds to wait between retry. - - `max_delay`: Max time in seconds to wait between retry. - - `backoff`: Factor to increase the delay by each retry. - - `local_logger`: Logger for logging. - - - -**Returns:** - The function decorator for retry. 
- - ---- - - - -## function `secure_run_subprocess` - -```python -secure_run_subprocess( - cmd: Sequence[str], - hide_cmd: bool = False, - **kwargs: dict[str, Any] -) → CompletedProcess[bytes] -``` - -Run command in subprocess according to security recommendations. - -CalledProcessError will not be raised on error of the command executed. Errors should be handled by the caller by checking the exit code. - -The command is executed with `subprocess.run`, additional arguments can be passed to it as keyword arguments. The following arguments to `subprocess.run` should not be set: `capture_output`, `shell`, `check`. As those arguments are used by this function. - - - -**Args:** - - - `cmd`: Command in a list. - - `hide_cmd`: Hide logging of cmd. - - `kwargs`: Additional keyword arguments for the `subprocess.run` call. - - - -**Returns:** - Object representing the completed process. The outputs subprocess can accessed. - - ---- - - + ## function `execute_command` @@ -118,7 +48,7 @@ The output is logged if the log level of the logger is set to debug. --- - + ## function `get_env_var` @@ -144,29 +74,7 @@ Looks for all upper-case and all low-case of the `env_var`. --- - - -## function `set_env_var` - -```python -set_env_var(env_var: str, value: str) → None -``` - -Set the environment variable value. - -Set the all upper case and all low case of the `env_var`. - - - -**Args:** - - - `env_var`: Name of the environment variable. - - `value`: Value to set environment variable to. - - ---- - - + ## function `bytes_with_unit_to_kib` @@ -196,7 +104,7 @@ Convert a positive integer followed by a unit to number of kibibytes. 
--- - + ## function `remove_residual_venv_dirs` diff --git a/src/charm.py b/src/charm.py index c60c62bea..0b17dbf52 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,10 +7,23 @@ # pylint: disable=too-many-lines """Charm for creating and managing GitHub self-hosted runner instances.""" +from github_runner_manager.manager.cloud_runner_manager import ( + GitHubRunnerConfig, + SupportServiceConfig, +) +from github_runner_manager.manager.runner_manager import ( + FlushMode, + RunnerManager, + RunnerManagerConfig, +) +from github_runner_manager.manager.runner_scaler import RunnerScaler +from github_runner_manager.openstack_cloud.openstack_runner_manager import ( + OpenStackCloudConfig, + OpenStackRunnerManager, + OpenStackServerConfig, +) +from github_runner_manager.types_.github import GitHubPath, GitHubRunnerStatus, parse_github_path -from manager.cloud_runner_manager import GitHubRunnerConfig, SupportServiceConfig -from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig -from manager.runner_scaler import RunnerScaler from utilities import bytes_with_unit_to_kib, execute_command, remove_residual_venv_dirs, retry # This is a workaround for https://bugs.launchpad.net/juju/+bug/2058335 @@ -59,20 +72,17 @@ TOKEN_CONFIG_NAME, CharmConfigInvalidError, CharmState, - GitHubPath, InstanceType, OpenstackImage, ProxyConfig, RunnerStorage, VirtualMachineResources, - parse_github_path, ) from errors import ( ConfigurationError, LogrotateSetupError, MissingMongoDBError, MissingRunnerBinaryError, - OpenStackUnauthorizedError, RunnerBinaryError, RunnerError, SubprocessError, @@ -80,12 +90,6 @@ ) from event_timer import EventTimer, TimerStatusError from firewall import Firewall, FirewallEntry -from github_type import GitHubRunnerStatus -from openstack_cloud.openstack_runner_manager import ( - OpenStackCloudConfig, - OpenStackRunnerManager, - OpenStackServerConfig, -) from runner import LXD_PROFILE_YAML from runner_manager import LXDRunnerManager, 
LXDRunnerManagerConfig from runner_manager_type import LXDFlushMode @@ -140,11 +144,6 @@ def func_with_catch_errors(self: "GithubRunnerCharm", event: EventT) -> None: "GitHub runner application not downloaded; the charm will retry download on " "reconcile interval" ) - except OpenStackUnauthorizedError: - logger.exception("Unauthorized OpenStack connection") - self.unit.status = BlockedStatus( - "Unauthorized OpenStack connection. Check credentials." - ) except MissingMongoDBError as err: logger.exception("Missing integration data") self.unit.status = WaitingStatus(str(err)) diff --git a/src/charm_state.py b/src/charm_state.py index dcd87d122..6ae46d386 100644 --- a/src/charm_state.py +++ b/src/charm_state.py @@ -19,6 +19,9 @@ import yaml from charms.data_platform_libs.v0.data_interfaces import DatabaseRequires +from github_runner_manager import openstack_cloud +from github_runner_manager.errors import OpenStackInvalidConfigError +from github_runner_manager.types_.github import GitHubPath, parse_github_path from ops import CharmBase from pydantic import ( AnyHttpUrl, @@ -31,8 +34,7 @@ validator, ) -import openstack_cloud -from errors import MissingMongoDBError, OpenStackInvalidConfigError +from errors import MissingMongoDBError from firewall import FirewallEntry from utilities import get_env_var @@ -87,75 +89,6 @@ class AnyHttpsUrl(AnyHttpUrl): allowed_schemes = {"https"} -@dataclasses.dataclass -class GitHubRepo: - """Represent GitHub repository. - - Attributes: - owner: Owner of the GitHub repository. - repo: Name of the GitHub repository. - """ - - owner: str - repo: str - - def path(self) -> str: - """Return a string representing the path. - - Returns: - Path to the GitHub entity. - """ - return f"{self.owner}/{self.repo}" - - -@dataclasses.dataclass -class GitHubOrg: - """Represent GitHub organization. - - Attributes: - org: Name of the GitHub organization. - group: Runner group to spawn the runners in. 
- """ - - org: str - group: str - - def path(self) -> str: - """Return a string representing the path. - - Returns: - Path to the GitHub entity. - """ - return self.org - - -GitHubPath = GitHubOrg | GitHubRepo - - -def parse_github_path(path_str: str, runner_group: str) -> GitHubPath: - """Parse GitHub path. - - Args: - path_str: GitHub path in string format. - runner_group: Runner group name for GitHub organization. If the path is - a repository this argument is ignored. - - Raises: - CharmConfigInvalidError: if an invalid path string was given. - - Returns: - GithubPath object representing the GitHub repository, or the GitHub - organization with runner group information. - """ - if "/" in path_str: - paths = tuple(segment for segment in path_str.split("/") if segment) - if len(paths) != 2: - raise CharmConfigInvalidError(f"Invalid path configuration {path_str}") - owner, repo = paths - return GitHubRepo(owner=owner, repo=repo) - return GitHubOrg(org=path_str, group=runner_group) - - @dataclasses.dataclass class GithubConfig: """Charm configuration related to GitHub. 
@@ -186,7 +119,10 @@ def from_charm(cls, charm: CharmBase) -> "GithubConfig": path_str = cast(str, charm.config.get(PATH_CONFIG_NAME, "")) if not path_str: raise CharmConfigInvalidError(f"Missing {PATH_CONFIG_NAME} configuration") - path = parse_github_path(cast(str, path_str), cast(str, runner_group)) + try: + path = parse_github_path(cast(str, path_str), cast(str, runner_group)) + except ValueError as e: + raise CharmConfigInvalidError(str(e)) from e token = cast(str, charm.config.get(TOKEN_CONFIG_NAME)) if not token: diff --git a/src/errors.py b/src/errors.py index 4285dc6e4..7212b4642 100644 --- a/src/errors.py +++ b/src/errors.py @@ -6,31 +6,31 @@ from typing import Union +# we import the errors from the module, these are used in the charm +from github_runner_manager.errors import ( # noqa: F401 pylint: disable=unused-import + CreateMetricsStorageError, + DeleteMetricsStorageError, + GetMetricsStorageError, + GithubClientError, + GithubMetricsError, + MetricsStorageError, + RunnerError, + TokenError, +) -class RunnerError(Exception): - """Generic runner error as base exception.""" - -class RunnerExecutionError(RunnerError): - """Error for executing commands on runner.""" +class RunnerCreateError(RunnerError): + """Error for runner creation failure.""" class RunnerFileLoadError(RunnerError): """Error for loading file on runner.""" -class RunnerCreateError(RunnerError): - """Error for runner creation failure.""" - - class RunnerRemoveError(RunnerError): """Error for runner removal failure.""" -class RunnerStartError(RunnerError): - """Error for runner start failure.""" - - class RunnerBinaryError(RunnerError): """Error of getting runner binary.""" @@ -100,81 +100,13 @@ class LogrotateSetupError(Exception): """Represents an error raised when logrotate cannot be setup.""" -class MetricsStorageError(Exception): - """Base class for all metrics storage errors.""" - - class SharedFilesystemError(MetricsStorageError): """Base class for all shared filesystem errors.""" 
-class CreateMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be created.""" - - -class DeleteMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be deleted.""" - - -class GetMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be retrieved.""" - - -class QuarantineMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be quarantined.""" - - class SharedFilesystemMountError(SharedFilesystemError): """Represents an error related to the mounting of the shared filesystem.""" -class RunnerMetricsError(Exception): - """Base class for all runner metrics errors.""" - - -class CorruptMetricDataError(RunnerMetricsError): - """Represents an error with the data being corrupt.""" - - -class GithubMetricsError(Exception): - """Base class for all github metrics errors.""" - - -class GithubClientError(Exception): - """Base class for all github client errors.""" - - -class GithubApiError(GithubClientError): - """Represents an error when the GitHub API returns an error.""" - - -class TokenError(GithubClientError): - """Represents an error when the token is invalid or has not enough permissions.""" - - -class JobNotFoundError(GithubClientError): - """Represents an error when the job could not be found on GitHub.""" - - class RunnerLogsError(Exception): """Base class for all runner logs errors.""" - - -class OpenStackError(Exception): - """Base class for OpenStack errors.""" - - -class OpenStackInvalidConfigError(OpenStackError): - """Represents an invalid OpenStack configuration.""" - - -class OpenStackUnauthorizedError(OpenStackError): - """Represents an unauthorized connection to OpenStack.""" - - -class SSHError(Exception): - """Represents an error while interacting with SSH.""" - - -class KeyfileError(SSHError): - """Represents missing keyfile for SSH.""" diff --git 
a/src/github_client.py b/src/github_client.py index b724b5cdb..b14d3b799 100644 --- a/src/github_client.py +++ b/src/github_client.py @@ -6,27 +6,22 @@ Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner. """ -import functools import logging -from datetime import datetime -from typing import Callable, ParamSpec, TypeVar -from urllib.error import HTTPError - -from ghapi.all import GhApi, pages -from ghapi.page import paged -from typing_extensions import assert_never - -from charm_state import Arch, GitHubOrg, GitHubPath, GitHubRepo -from errors import GithubApiError, JobNotFoundError, RunnerBinaryError, TokenError -from github_type import ( - JobStats, - RegistrationToken, - RemoveToken, +from typing import ParamSpec, TypeVar + +from github_runner_manager.github_client import GithubClient as GitHubRunnerManagerGitHubClient +from github_runner_manager.github_client import catch_http_errors +from github_runner_manager.types_.github import ( + GitHubOrg, + GitHubPath, + GitHubRepo, RunnerApplication, RunnerApplicationList, - SelfHostedRunner, ) +from charm_state import Arch +from errors import RunnerBinaryError + logger = logging.getLogger(__name__) # Parameters of the function decorated with retry @@ -35,57 +30,9 @@ ReturnT = TypeVar("ReturnT") -def catch_http_errors(func: Callable[ParamT, ReturnT]) -> Callable[ParamT, ReturnT]: - """Catch HTTP errors and raise custom exceptions. - - Args: - func: The target function to catch common errors for. - - Returns: - The decorated function. - """ - - @functools.wraps(func) - def wrapper(*args: ParamT.args, **kwargs: ParamT.kwargs) -> ReturnT: - """Catch common errors when using the GitHub API. - - Args: - args: Placeholder for positional arguments. - kwargs: Placeholder for keyword arguments. - - Raises: - TokenError: If there was an error with the provided token. - GithubApiError: If there was an unexpected error using the GitHub API. - - Returns: - The decorated function. 
- """ - try: - return func(*args, **kwargs) - except HTTPError as exc: - if exc.code in (401, 403): - if exc.code == 401: - msg = "Invalid token." - else: - msg = "Provided token has not enough permissions or has reached rate-limit." - raise TokenError(msg) from exc - raise GithubApiError from exc - - return wrapper - - -class GithubClient: +class GithubClient(GitHubRunnerManagerGitHubClient): """GitHub API client.""" - def __init__(self, token: str): - """Instantiate the GiHub API client. - - Args: - token: GitHub personal token for API requests. - """ - self._token = token - self._client = GhApi(token=self._token) - @catch_http_errors def get_runner_application( self, path: GitHubPath, arch: Arch, os: str = "linux" @@ -125,176 +72,3 @@ def get_runner_application( raise RunnerBinaryError( f"Unable query GitHub runner binary information for {os} {arch}" ) from err - - @catch_http_errors - def get_runner_github_info(self, path: GitHubPath) -> list[SelfHostedRunner]: - """Get runner information on GitHub under a repo or org. - - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - - Returns: - List of runner information. - """ - remote_runners_list: list[SelfHostedRunner] = [] - - if isinstance(path, GitHubRepo): - # The documentation of ghapi for pagination is incorrect and examples will give errors. - # This workaround is a temp solution. Will be moving to PyGitHub in the future. - self._client.actions.list_self_hosted_runners_for_repo( - owner=path.owner, repo=path.repo, per_page=100 - ) - num_of_pages = self._client.last_page() - remote_runners_list = [ - item - for page in pages( - self._client.actions.list_self_hosted_runners_for_repo, - num_of_pages + 1, - owner=path.owner, - repo=path.repo, - per_page=100, - ) - for item in page["runners"] - ] - if isinstance(path, GitHubOrg): - # The documentation of ghapi for pagination is incorrect and examples will give errors. - # This workaround is a temp solution. 
Will be moving to PyGitHub in the future. - self._client.actions.list_self_hosted_runners_for_org(org=path.org, per_page=100) - num_of_pages = self._client.last_page() - remote_runners_list = [ - item - for page in pages( - self._client.actions.list_self_hosted_runners_for_org, - num_of_pages + 1, - org=path.org, - per_page=100, - ) - for item in page["runners"] - ] - return remote_runners_list - - @catch_http_errors - def get_runner_remove_token(self, path: GitHubPath) -> str: - """Get token from GitHub used for removing runners. - - Args: - path: The Github org/repo path. - - Returns: - The removing token. - """ - token: RemoveToken - if isinstance(path, GitHubRepo): - token = self._client.actions.create_remove_token_for_repo( - owner=path.owner, repo=path.repo - ) - elif isinstance(path, GitHubOrg): - token = self._client.actions.create_remove_token_for_org(org=path.org) - else: - assert_never(token) - - return token["token"] - - @catch_http_errors - def get_runner_registration_token(self, path: GitHubPath) -> str: - """Get token from GitHub used for registering runners. - - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - - Returns: - The registration token. - """ - token: RegistrationToken - if isinstance(path, GitHubRepo): - token = self._client.actions.create_registration_token_for_repo( - owner=path.owner, repo=path.repo - ) - elif isinstance(path, GitHubOrg): - token = self._client.actions.create_registration_token_for_org(org=path.org) - else: - assert_never(token) - - return token["token"] - - @catch_http_errors - def delete_runner(self, path: GitHubPath, runner_id: int) -> None: - """Delete the self-hosted runner from GitHub. - - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - runner_id: Id of the runner. 
- """ - if isinstance(path, GitHubRepo): - self._client.actions.delete_self_hosted_runner_from_repo( - owner=path.owner, - repo=path.repo, - runner_id=runner_id, - ) - if isinstance(path, GitHubOrg): - self._client.actions.delete_self_hosted_runner_from_org( - org=path.org, - runner_id=runner_id, - ) - - def get_job_info(self, path: GitHubRepo, workflow_run_id: str, runner_name: str) -> JobStats: - """Get information about a job for a specific workflow run. - - Args: - path: GitHub repository path in the format '/'. - workflow_run_id: Id of the workflow run. - runner_name: Name of the runner. - - Raises: - TokenError: if there was an error with the Github token crdential provided. - JobNotFoundError: If no jobs were found. - - Returns: - Job information. - """ - paged_kwargs = {"owner": path.owner, "repo": path.repo, "run_id": workflow_run_id} - try: - for wf_run_page in paged( - self._client.actions.list_jobs_for_workflow_run, **paged_kwargs - ): - jobs = wf_run_page["jobs"] - # ghapi performs endless pagination, - # so we have to break out of the loop if there are no more jobs - if not jobs: - break - for job in jobs: - if job["runner_name"] == runner_name: - # datetime strings should be in ISO 8601 format, - # but they can also use Z instead of - # +00:00, which is not supported by datetime.fromisoformat - created_at = datetime.fromisoformat( - job["created_at"].replace("Z", "+00:00") - ) - started_at = datetime.fromisoformat( - job["started_at"].replace("Z", "+00:00") - ) - # conclusion could be null per api schema, so we need to handle that - # though we would assume that it should always be present, - # as the job should be finished - conclusion = job.get("conclusion", None) - - job_id = job["id"] - return JobStats( - job_id=job_id, - created_at=created_at, - started_at=started_at, - conclusion=conclusion, - ) - - except HTTPError as exc: - if exc.code in (401, 403): - raise TokenError from exc - raise JobNotFoundError( - f"Could not find job for runner 
{runner_name}. " - f"Could not list jobs for workflow run {workflow_run_id}" - ) from exc - - raise JobNotFoundError(f"Could not find job for runner {runner_name}.") diff --git a/src/github_type.py b/src/github_type.py deleted file mode 100644 index a26a0279a..000000000 --- a/src/github_type.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Return type for the GitHub web API.""" - - -from __future__ import annotations - -from datetime import datetime -from enum import Enum -from typing import List, Literal, Optional, TypedDict - -from pydantic import BaseModel -from typing_extensions import NotRequired - - -class GitHubRunnerStatus(str, Enum): - """Status of runner on GitHub. - - Attributes: - ONLINE: Represents an online runner status. - OFFLINE: Represents an offline runner status. - """ - - ONLINE = "online" - OFFLINE = "offline" - - -# See response schema for -# https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#list-runner-applications-for-an-organization -class RunnerApplication(TypedDict, total=False): - """Information on the runner application. - - Attributes: - os: Operating system to run the runner application on. - architecture: Computer Architecture to run the runner application on. - download_url: URL to download the runner application. - filename: Filename of the runner application. - temp_download_token: A short lived bearer token used to download the - runner, if needed. - sha256_checksum: SHA256 Checksum of the runner application. - """ - - os: Literal["linux", "win", "osx"] - architecture: Literal["arm", "arm64", "x64"] - download_url: str - filename: str - temp_download_token: NotRequired[str] - sha256_checksum: NotRequired[str] - - -RunnerApplicationList = List[RunnerApplication] - - -class SelfHostedRunnerLabel(TypedDict, total=False): - """A single label of self-hosted runners. - - Attributes: - id: Unique identifier of the label. 
- name: Name of the label. - type: Type of label. Read-only labels are applied automatically when - the runner is configured. - """ - - id: NotRequired[int] - name: str - type: NotRequired[str] - - -class SelfHostedRunner(TypedDict): - """Information on a single self-hosted runner. - - Attributes: - busy: Whether the runner is executing a job. - id: Unique identifier of the runner. - labels: Labels of the runner. - os: Operation system of the runner. - name: Name of the runner. - status: The Github runner status. - """ - - busy: bool - id: int - labels: list[SelfHostedRunnerLabel] - os: str - name: str - status: GitHubRunnerStatus - - -class SelfHostedRunnerList(TypedDict): - """Information on a collection of self-hosted runners. - - Attributes: - total_count: Total number of runners. - runners: List of runners. - """ - - total_count: int - runners: list[SelfHostedRunner] - - -class RegistrationToken(TypedDict): - """Token used for registering GitHub runners. - - Attributes: - token: Token for registering GitHub runners. - expires_at: Time the token expires at. - """ - - token: str - expires_at: str - - -class RemoveToken(TypedDict): - """Token used for removing GitHub runners. - - Attributes: - token: Token for removing GitHub runners. - expires_at: Time the token expires at. - """ - - token: str - expires_at: str - - -class JobConclusion(str, Enum): - """Conclusion of a job on GitHub. - - See :https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28\ -#list-workflow-runs-for-a-repository - - Attributes: - ACTION_REQUIRED: Represents additional action required on the job. - CANCELLED: Represents a cancelled job status. - FAILURE: Represents a failed job status. - NEUTRAL: Represents a job status that can optionally succeed or fail. - SKIPPED: Represents a skipped job status. - SUCCESS: Represents a successful job status. - TIMED_OUT: Represents a job that has timed out. 
- """ - - ACTION_REQUIRED = "action_required" - CANCELLED = "cancelled" - FAILURE = "failure" - NEUTRAL = "neutral" - SKIPPED = "skipped" - SUCCESS = "success" - TIMED_OUT = "timed_out" - - -class JobStats(BaseModel): - """Stats for a job on GitHub. - - Attributes: - job_id: The ID of the job. - created_at: The time the job was created. - started_at: The time the job was started. - conclusion: The end result of a job. - """ - - job_id: int - created_at: datetime - started_at: datetime - conclusion: Optional[JobConclusion] diff --git a/src/logrotate.py b/src/logrotate.py index 0fd65d5af..294c651dd 100644 --- a/src/logrotate.py +++ b/src/logrotate.py @@ -6,11 +6,11 @@ from pathlib import Path from charms.operator_libs_linux.v1 import systemd +from github_runner_manager.metrics.events import METRICS_LOG_PATH +from github_runner_manager.reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from pydantic import BaseModel from errors import LogrotateSetupError -from metrics.events import METRICS_LOG_PATH -from reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR LOG_ROTATE_TIMER_SYSTEMD_SERVICE = "logrotate.timer" diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py deleted file mode 100644 index aff75ed41..000000000 --- a/src/manager/cloud_runner_manager.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Interface of manager of runner instance on clouds.""" - -import abc -import logging -from dataclasses import dataclass -from enum import Enum, auto -from typing import Iterator, Sequence, Tuple - -from charm_state import GitHubPath, ProxyConfig, RepoPolicyComplianceConfig, SSHDebugConnection -from metrics.runner import RunnerMetrics - -logger = logging.getLogger(__name__) - -InstanceId = str - - -class HealthState(Enum): - """Health state of the runners. - - Attributes: - HEALTHY: The runner is healthy. - UNHEALTHY: The runner is not healthy. 
- UNKNOWN: Unable to get the health state. - """ - - HEALTHY = auto() - UNHEALTHY = auto() - UNKNOWN = auto() - - -class CloudRunnerState(str, Enum): - """Represent state of the instance hosting the runner. - - Attributes: - CREATED: The instance is created. - ACTIVE: The instance is active and running. - DELETED: The instance is deleted. - ERROR: The instance has encountered error and not running. - STOPPED: The instance has stopped. - UNKNOWN: The state of the instance is not known. - UNEXPECTED: An unknown state not accounted by the developer is encountered. - """ - - CREATED = auto() - ACTIVE = auto() - DELETED = auto() - ERROR = auto() - STOPPED = auto() - UNKNOWN = auto() - UNEXPECTED = auto() - - # Exclude from coverage as not much value for testing this object conversion. - @staticmethod - def from_openstack_server_status( # pragma: no cover - openstack_server_status: str, - ) -> "CloudRunnerState": - """Create from openstack server status. - - The openstack server status are documented here: - https://docs.openstack.org/api-guide/compute/server_concepts.html - - Args: - openstack_server_status: Openstack server status. - - Returns: - The state of the runner. - """ - state = CloudRunnerState.UNEXPECTED - match openstack_server_status: - case "BUILD": - state = CloudRunnerState.CREATED - case "REBUILD": - state = CloudRunnerState.CREATED - case "ACTIVE": - state = CloudRunnerState.ACTIVE - case "ERROR": - state = CloudRunnerState.ERROR - case "STOPPED": - state = CloudRunnerState.STOPPED - case "DELETED": - state = CloudRunnerState.DELETED - case "UNKNOWN": - state = CloudRunnerState.UNKNOWN - case _: - state = CloudRunnerState.UNEXPECTED - return state - - -@dataclass -class GitHubRunnerConfig: - """Configuration for GitHub runner spawned. - - Attributes: - github_path: The GitHub organization or repository for runners to connect to. - labels: The labels to add to runners. 
- """ - - github_path: GitHubPath - labels: list[str] - - -@dataclass -class SupportServiceConfig: - """Configuration for supporting services for runners. - - Attributes: - proxy_config: The proxy configuration. - dockerhub_mirror: The dockerhub mirror to use for runners. - ssh_debug_connections: The information on the ssh debug services. - repo_policy_compliance: The configuration of the repo policy compliance service. - """ - - proxy_config: ProxyConfig | None - dockerhub_mirror: str | None - ssh_debug_connections: list[SSHDebugConnection] | None - repo_policy_compliance: RepoPolicyComplianceConfig | None - - -@dataclass -class CloudRunnerInstance: - """Information on the runner on the cloud. - - Attributes: - name: Name of the instance hosting the runner. - instance_id: ID of the instance. - health: Health state of the runner. - state: State of the instance hosting the runner. - """ - - name: str - instance_id: InstanceId - health: HealthState - state: CloudRunnerState - - -class CloudRunnerManager(abc.ABC): - """Manage runner instance on cloud. - - Attributes: - name_prefix: The name prefix of the self-hosted runners. - """ - - @property - @abc.abstractmethod - def name_prefix(self) -> str: - """Get the name prefix of the self-hosted runners.""" - - @abc.abstractmethod - def create_runner(self, registration_token: str) -> InstanceId: - """Create a self-hosted runner. - - Args: - registration_token: The GitHub registration token for registering runners. - """ - - @abc.abstractmethod - def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: - """Get a self-hosted runner by instance id. - - Args: - instance_id: The instance id. - """ - - @abc.abstractmethod - def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerInstance]: - """Get self-hosted runners by state. - - Args: - states: Filter for the runners with these github states. If None all states will be - included. 
- """ - - @abc.abstractmethod - def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: - """Delete self-hosted runner. - - Args: - instance_id: The instance id of the runner to delete. - remove_token: The GitHub remove token. - """ - - @abc.abstractmethod - def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: - """Stop all runners. - - Args: - remove_token: The GitHub remove token for removing runners. - busy: If false, only idle runners are removed. If true, both idle and busy runners are - removed. - """ - - @abc.abstractmethod - def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: - """Cleanup runner and resource on the cloud. - - Perform health check on runner and delete the runner if it fails. - - Args: - remove_token: The GitHub remove token for removing runners. - """ diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py deleted file mode 100644 index 949a1df38..000000000 --- a/src/manager/github_runner_manager.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Client for managing self-hosted runner on GitHub side.""" - -from enum import Enum, auto -from typing import Iterable - -from charm_state import GitHubPath -from github_client import GithubClient -from github_type import GitHubRunnerStatus, SelfHostedRunner - - -class GitHubRunnerState(str, Enum): - """State of the self-hosted runner on GitHub. - - Attributes: - BUSY: Runner is working on a job assigned by GitHub. - IDLE: Runner is waiting to take a job or is running pre-job tasks (i.e. - repo-policy-compliance check). - OFFLINE: Runner is not connected to GitHub. - """ - - BUSY = auto() - IDLE = auto() - OFFLINE = auto() - - @staticmethod - def from_runner(runner: SelfHostedRunner) -> "GitHubRunnerState": - """Construct the object from GtiHub runner information. 
- - Args: - runner: Information on the GitHub self-hosted runner. - - Returns: - The state of runner. - """ - state = GitHubRunnerState.OFFLINE - # A runner that is busy and offline is possible. - if runner["busy"]: - state = GitHubRunnerState.BUSY - if runner["status"] == GitHubRunnerStatus.ONLINE: - if not runner["busy"]: - state = GitHubRunnerState.IDLE - return state - - -# Thin wrapper around the GitHub Client. Not much value in unit testing. -class GitHubRunnerManager: # pragma: no cover - """Manage self-hosted runner on GitHub side.""" - - def __init__(self, prefix: str, token: str, path: GitHubPath): - """Construct the object. - - Args: - prefix: The prefix in the name to identify the runners managed by this instance. - token: The GitHub personal access token to access the GitHub API. - path: The GitHub repository or organization to register the runners under. - """ - self._prefix = prefix - self._path = path - self.github = GithubClient(token) - - def get_runners( - self, states: Iterable[GitHubRunnerState] | None = None - ) -> tuple[SelfHostedRunner, ...]: - """Get info on self-hosted runners of certain states. - - Args: - states: Filter the runners for these states. If None, all runners are returned. - - Returns: - Information on the runners. - """ - runner_list = self.github.get_runner_github_info(self._path) - runner_list = [runner for runner in runner_list if runner.name.startswith(self._prefix)] - - if states is None: - return tuple(runner_list) - - state_set = set(states) - return tuple( - runner - for runner in runner_list - if GitHubRunnerManager._is_runner_in_state(runner, state_set) - ) - - def delete_runners(self, states: Iterable[GitHubRunnerState] | None = None) -> None: - """Delete the self-hosted runners of certain states. - - Args: - states: Filter the runners for these states. If None, all runners are deleted. 
- """ - runner_list = self.get_runners(states) - for runner in runner_list: - self.github.delete_runner(self._path, runner.id) - - def get_registration_token(self) -> str: - """Get registration token from GitHub. - - This token is used for registering self-hosted runners. - - Returns: - The registration token. - """ - return self.github.get_runner_registration_token(self._path) - - def get_removal_token(self) -> str: - """Get removal token from GitHub. - - This token is used for removing self-hosted runners. - - Returns: - The removal token. - """ - return self.github.get_runner_remove_token(self._path) - - @staticmethod - def _is_runner_in_state( - runner: SelfHostedRunner, states: set[GitHubRunnerState] | None - ) -> bool: - """Check that the runner is in one of the states provided. - - Args: - runner: Runner to filter. - states: States in which to check the runner belongs to. - - Returns: - True if the runner is in one of the state, else false. - """ - if states is None: - return True - return GitHubRunnerState.from_runner(runner) in states diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py deleted file mode 100644 index 72ded77fb..000000000 --- a/src/manager/runner_manager.py +++ /dev/null @@ -1,364 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Class for managing the GitHub self-hosted runners hosted on cloud instances.""" - -import logging -from dataclasses import dataclass -from enum import Enum, auto -from multiprocessing import Pool -from typing import Iterator, Sequence, Type, cast - -from charm_state import GitHubPath -from errors import GithubMetricsError, RunnerCreateError -from github_type import SelfHostedRunner -from manager.cloud_runner_manager import ( - CloudRunnerInstance, - CloudRunnerManager, - CloudRunnerState, - HealthState, - InstanceId, -) -from manager.github_runner_manager import GitHubRunnerManager, GitHubRunnerState -from metrics import events as metric_events -from metrics import github as github_metrics -from metrics import runner as runner_metrics -from metrics.runner import RunnerMetrics - -logger = logging.getLogger(__name__) - -IssuedMetricEventsStats = dict[Type[metric_events.Event], int] - - -class FlushMode(Enum): - """Strategy for flushing runners. - - Attributes: - FLUSH_IDLE: Flush idle runners. - FLUSH_BUSY: Flush busy runners. - """ - - FLUSH_IDLE = auto() - FLUSH_BUSY = auto() - - -@dataclass -class RunnerInstance: - """Represents an instance of runner. - - Attributes: - name: Full name of the runner. Managed by the cloud runner manager. - instance_id: ID of the runner. Managed by the runner manager. - health: The health state of the runner. - github_state: State on github. - cloud_state: State on cloud. - """ - - name: str - instance_id: InstanceId - health: HealthState - github_state: GitHubRunnerState | None - cloud_state: CloudRunnerState - - def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner | None): - """Construct an instance. - - Args: - cloud_instance: Information on the cloud instance. - github_info: Information on the GitHub of the runner. 
- """ - self.name = cloud_instance.name - self.instance_id = cloud_instance.instance_id - self.health = cloud_instance.health - self.github_state = ( - GitHubRunnerState.from_runner(github_info) if github_info is not None else None - ) - self.cloud_state = cloud_instance.state - - -@dataclass -class RunnerManagerConfig: - """Configuration for the runner manager. - - Attributes: - token: GitHub personal access token to query GitHub API. - path: Path to GitHub repository or organization to registry the runners. - """ - - token: str - path: GitHubPath - - -class RunnerManager: - """Manage the runners. - - Attributes: - manager_name: A name to identify this manager. - name_prefix: The name prefix of the runners. - """ - - def __init__( - self, - manager_name: str, - cloud_runner_manager: CloudRunnerManager, - config: RunnerManagerConfig, - ): - """Construct the object. - - Args: - manager_name: A name to identify this manager. - cloud_runner_manager: For managing the cloud instance of the runner. - config: Configuration of this class. - """ - self.manager_name = manager_name - self._config = config - self._cloud = cloud_runner_manager - self.name_prefix = self._cloud.name_prefix - self._github = GitHubRunnerManager( - prefix=self.name_prefix, token=self._config.token, path=self._config.path - ) - - def create_runners(self, num: int) -> tuple[InstanceId]: - """Create runners. - - Args: - num: Number of runners to create. - - Returns: - List of instance ID of the runners. 
- """ - logger.info("Creating %s runners", num) - registration_token = self._github.get_registration_token() - - create_runner_args = [ - RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num) - ] - return RunnerManager._spawn_runners(create_runner_args) - - def get_runners( - self, - github_states: Sequence[GitHubRunnerState] | None = None, - cloud_states: Sequence[CloudRunnerState] | None = None, - ) -> tuple[RunnerInstance]: - """Get information on runner filter by state. - - Only runners that has cloud instance are returned. - - Args: - github_states: Filter for the runners with these github states. If None all - states will be included. - cloud_states: Filter for the runners with these cloud states. If None all states - will be included. - - Returns: - Information on the runners. - """ - logger.info("Getting runners...") - github_infos = self._github.get_runners(github_states) - cloud_infos = self._cloud.get_runners(cloud_states) - github_infos_map = {info["name"]: info for info in github_infos} - cloud_infos_map = {info.name: info for info in cloud_infos} - logger.info( - "Found following runners: %s", cloud_infos_map.keys() | github_infos_map.keys() - ) - - runner_names = cloud_infos_map.keys() & github_infos_map.keys() - cloud_only = cloud_infos_map.keys() - runner_names - github_only = github_infos_map.keys() - runner_names - if cloud_only: - logger.warning( - "Found runner instance on cloud but not registered on GitHub: %s", cloud_only - ) - if github_only: - logger.warning( - "Found self-hosted runner on GitHub but no matching runner instance on cloud: %s", - github_only, - ) - - runner_instances: list[RunnerInstance] = [ - RunnerInstance( - cloud_infos_map[name], github_infos_map[name] if name in github_infos_map else None - ) - for name in cloud_infos_map.keys() - ] - if cloud_states is not None: - runner_instances = [ - runner for runner in runner_instances if runner.cloud_state in cloud_states - ] - if github_states is 
not None: - runner_instances = [ - runner - for runner in runner_instances - if runner.github_state is not None and runner.github_state in github_states - ] - return cast(tuple[RunnerInstance], tuple(runner_instances)) - - def delete_runners(self, num: int) -> IssuedMetricEventsStats: - """Delete runners. - - Args: - num: The number of runner to delete. - - Returns: - Stats on metrics events issued during the deletion of runners. - """ - logger.info("Deleting %s number of runners", num) - runners_list = self.get_runners()[:num] - runner_names = [runner.name for runner in runners_list] - logger.info("Deleting runners: %s", runner_names) - remove_token = self._github.get_removal_token() - return self._delete_runners(runners=runners_list, remove_token=remove_token) - - def flush_runners( - self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE - ) -> IssuedMetricEventsStats: - """Delete runners according to state. - - Args: - flush_mode: The type of runners affect by the deletion. - - Returns: - Stats on metrics events issued during the deletion of runners. - """ - match flush_mode: - case FlushMode.FLUSH_IDLE: - logger.info("Flushing idle runners...") - case FlushMode.FLUSH_BUSY: - logger.info("Flushing idle and busy runners...") - case _: - logger.critical( - "Unknown flush mode %s encountered, contact developers", flush_mode - ) - - busy = False - if flush_mode == FlushMode.FLUSH_BUSY: - busy = True - remove_token = self._github.get_removal_token() - stats = self._cloud.flush_runners(remove_token, busy) - return self._issue_runner_metrics(metrics=stats) - - def cleanup(self) -> IssuedMetricEventsStats: - """Run cleanup of the runners and other resources. - - Returns: - Stats on metrics events issued during the cleanup of runners. 
- """ - self._github.delete_runners([GitHubRunnerState.OFFLINE]) - remove_token = self._github.get_removal_token() - deleted_runner_metrics = self._cloud.cleanup(remove_token) - return self._issue_runner_metrics(metrics=deleted_runner_metrics) - - @staticmethod - def _spawn_runners( - create_runner_args: Sequence["RunnerManager._CreateRunnerArgs"], - ) -> tuple[InstanceId, ...]: - """Parallel spawn of runners. - - The length of the create_runner_args is number _create_runner invocation, and therefore the - number of runner spawned. - - Args: - create_runner_args: List of arg for invoking _create_runner method. - - Returns: - A list of instance ID of runner spawned. - """ - num = len(create_runner_args) - - instance_id_list = [] - with Pool(processes=min(num, 10)) as pool: - jobs = pool.imap_unordered( - func=RunnerManager._create_runner, iterable=create_runner_args - ) - for _ in range(num): - try: - instance_id = next(jobs) - except RunnerCreateError: - logger.exception("Failed to spawn a runner.") - except StopIteration: - break - else: - instance_id_list.append(instance_id) - return tuple(instance_id_list) - - def _delete_runners( - self, runners: Sequence[RunnerInstance], remove_token: str - ) -> IssuedMetricEventsStats: - """Delete list of runners. - - Args: - runners: The runners to delete. - remove_token: The token for removing self-hosted runners. - - Returns: - Stats on metrics events issued during the deletion of runners. - """ - runner_metrics_list = [] - for runner in runners: - deleted_runner_metrics = self._cloud.delete_runner( - instance_id=runner.instance_id, remove_token=remove_token - ) - if deleted_runner_metrics is not None: - runner_metrics_list.append(deleted_runner_metrics) - return self._issue_runner_metrics(metrics=iter(runner_metrics_list)) - - def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: - """Issue runner metrics. - - Args: - metrics: Runner metrics to issue. 
- - Returns: - Stats on runner metrics issued. - """ - total_stats: IssuedMetricEventsStats = {} - - for extracted_metrics in metrics: - try: - job_metrics = github_metrics.job( - github_client=self._github.github, - pre_job_metrics=extracted_metrics.pre_job, - runner_name=extracted_metrics.runner_name, - ) - except GithubMetricsError: - logger.exception( - "Failed to calculate job metrics for %s", extracted_metrics.runner_name - ) - job_metrics = None - - issued_events = runner_metrics.issue_events( - runner_metrics=extracted_metrics, - job_metrics=job_metrics, - flavor=self.manager_name, - ) - - for event_type in issued_events: - total_stats[event_type] = total_stats.get(event_type, 0) + 1 - - return total_stats - - @dataclass - class _CreateRunnerArgs: - """Arguments for the _create_runner function. - - Attrs: - cloud_runner_manager: For managing the cloud instance of the runner. - registration_token: The GitHub provided-token for registering runners. - """ - - cloud_runner_manager: CloudRunnerManager - registration_token: str - - @staticmethod - def _create_runner(args: _CreateRunnerArgs) -> InstanceId: - """Create a single runner. - - This is a staticmethod for usage with multiprocess.Pool. - - Args: - args: The arguments. - - Returns: - The instance ID of the runner created. - """ - return args.cloud_runner_manager.create_runner(registration_token=args.registration_token) diff --git a/src/manager/runner_scaler.py b/src/manager/runner_scaler.py deleted file mode 100644 index 271b92e51..000000000 --- a/src/manager/runner_scaler.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Module for scaling the runners amount.""" - -import logging -import time -from dataclasses import dataclass - -from pydantic import MongoDsn - -import reactive.runner_manager as reactive_runner_manager -from charm_state import ReactiveConfig -from errors import IssueMetricEventError, MissingServerConfigError -from manager.cloud_runner_manager import HealthState -from manager.github_runner_manager import GitHubRunnerState -from manager.runner_manager import FlushMode, RunnerManager -from metrics import events as metric_events - -logger = logging.getLogger(__name__) - - -@dataclass -class RunnerInfo: - """Information on the runners. - - Attributes: - online: The number of runner in online state. - busy: The number of the runner in busy state. - offline: The number of runner in offline state. - unknown: The number of runner in unknown state. - runners: The names of the online runners. - busy_runners: The names of the busy runners. - """ - - online: int - busy: int - offline: int - unknown: int - runners: tuple[str, ...] - busy_runners: tuple[str, ...] - - -class RunnerScaler: - """Manage the reconcile of runners.""" - - def __init__(self, runner_manager: RunnerManager, reactive_config: ReactiveConfig | None): - """Construct the object. - - Args: - runner_manager: The RunnerManager to perform runner reconcile. - reactive_config: Reactive runner configuration. - """ - self._manager = runner_manager - self._reactive_config = reactive_config - - def get_runner_info(self) -> RunnerInfo: - """Get information on the runners. - - Returns: - The information on the runners. 
- """ - runner_list = self._manager.get_runners() - online = 0 - busy = 0 - offline = 0 - unknown = 0 - online_runners = [] - busy_runners = [] - for runner in runner_list: - match runner.github_state: - case GitHubRunnerState.BUSY: - online += 1 - online_runners.append(runner.name) - busy += 1 - busy_runners.append(runner.name) - case GitHubRunnerState.IDLE: - online += 1 - online_runners.append(runner.name) - case GitHubRunnerState.OFFLINE: - offline += 1 - case _: - unknown += 1 - return RunnerInfo( - online=online, - busy=busy, - offline=offline, - unknown=unknown, - runners=tuple(online_runners), - busy_runners=tuple(busy_runners), - ) - - def flush(self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: - """Flush the runners. - - Args: - flush_mode: Determines the types of runner to be flushed. - - Returns: - Number of runners flushed. - """ - metric_stats = self._manager.cleanup() - delete_metric_stats = self._manager.flush_runners(flush_mode=flush_mode) - events = set(delete_metric_stats.keys()) | set(metric_stats.keys()) - metric_stats = { - event_name: delete_metric_stats.get(event_name, 0) + metric_stats.get(event_name, 0) - for event_name in events - } - return metric_stats.get(metric_events.RunnerStop, 0) - - def reconcile(self, quantity: int) -> int: - """Reconcile the quantity of runners. - - Args: - quantity: The number of intended runners. - - Returns: - The Change in number of runners. 
- """ - logger.info("Start reconcile to %s runner", quantity) - - if self._reactive_config is not None: - logger.info("Reactive configuration detected, going into experimental reactive mode.") - return self._reconcile_reactive(quantity, self._reactive_config.mq_uri) - - start_timestamp = time.time() - delete_metric_stats = None - metric_stats = self._manager.cleanup() - runners = self._manager.get_runners() - logger.info("Reconcile runners from %s to %s", len(runners), quantity) - runner_diff = quantity - len(runners) - if runner_diff > 0: - try: - self._manager.create_runners(runner_diff) - except MissingServerConfigError: - logging.exception( - "Unable to spawn runner due to missing server configuration, such as, image." - ) - elif runner_diff < 0: - delete_metric_stats = self._manager.delete_runners(-runner_diff) - else: - logger.info("No changes to the number of runners.") - end_timestamp = time.time() - - # Merge the two metric stats. - if delete_metric_stats is not None: - metric_stats = { - event_name: delete_metric_stats.get(event_name, 0) - + metric_stats.get(event_name, 0) - for event_name in set(delete_metric_stats) | set(metric_stats) - } - - runner_list = self._manager.get_runners() - busy_runners = [ - runner for runner in runner_list if runner.github_state == GitHubRunnerState.BUSY - ] - idle_runners = [ - runner for runner in runner_list if runner.github_state == GitHubRunnerState.IDLE - ] - offline_healthy_runners = [ - runner - for runner in runner_list - if runner.github_state == GitHubRunnerState.OFFLINE - and runner.health == HealthState.HEALTHY - ] - unhealthy_states = set((HealthState.UNHEALTHY, HealthState.UNKNOWN)) - unhealthy_runners = [runner for runner in runner_list if runner.health in unhealthy_states] - logger.info("Found %s busy runners: %s", len(busy_runners), busy_runners) - logger.info("Found %s idle runners: %s", len(idle_runners), idle_runners) - logger.info( - "Found %s offline runners that are healthy: %s", - 
len(offline_healthy_runners), - offline_healthy_runners, - ) - logger.info("Found %s unhealthy runners: %s", len(unhealthy_runners), unhealthy_runners) - - try: - available_runners = set(runner.name for runner in idle_runners) | set( - runner.name for runner in offline_healthy_runners - ) - logger.info( - "Current available runners (idle + healthy offline): %s", available_runners - ) - metric_events.issue_event( - metric_events.Reconciliation( - timestamp=time.time(), - flavor=self._manager.manager_name, - crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) - - metric_stats.get(metric_events.RunnerStop, 0), - idle_runners=len(available_runners), - duration=end_timestamp - start_timestamp, - ) - ) - except IssueMetricEventError: - logger.exception("Failed to issue Reconciliation metric") - - return runner_diff - - def _reconcile_reactive(self, quantity: int, mq_uri: MongoDsn) -> int: - """Reconcile runners reactively. - - Args: - quantity: Number of intended runners. - mq_uri: The URI of the MQ to use to spawn runners reactively. - - Returns: - The difference between intended runners and actual runners. In reactive mode - this number is never negative as additional processes should terminate after a timeout. - """ - logger.info("Reactive mode is experimental and not yet fully implemented.") - return reactive_runner_manager.reconcile( - quantity=quantity, - mq_uri=mq_uri, - queue_name=self._manager.manager_name, - ) diff --git a/src/metrics/__init__.py b/src/metrics/__init__.py deleted file mode 100644 index d2a48eaed..000000000 --- a/src/metrics/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Package for common metrics-related code.""" diff --git a/src/metrics/events.py b/src/metrics/events.py deleted file mode 100644 index 6f858166d..000000000 --- a/src/metrics/events.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Models and functions for the metric events.""" -import logging -from pathlib import Path -from typing import Any, Optional - -from pydantic import BaseModel, NonNegativeFloat - -from errors import IssueMetricEventError - -METRICS_LOG_PATH = Path("/var/log/github-runner-metrics.log") - - -logger = logging.getLogger(__name__) - - -class Event(BaseModel): - """Base class for metric events. - - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - event: The name of the event. Will be set to the class name in snake case if not provided. - """ - - timestamp: NonNegativeFloat - event: str - - @staticmethod - def _camel_to_snake(camel_case_string: str) -> str: - """Convert a camel case string to snake case. - - Args: - camel_case_string: The string to convert. - - Returns: - The converted string. - """ - snake_case_string = camel_case_string[0].lower() - for char in camel_case_string[1:]: - if char.isupper(): - snake_case_string += "_" + char.lower() - else: - snake_case_string += char - return snake_case_string - - def __init__(self, *args: Any, **kwargs: Any): - """Initialize the event. - - Args: - args: The positional arguments to pass to the base class. - kwargs: The keyword arguments to pass to the base class. These are used to set the - specific fields. E.g. timestamp=12345 will set the timestamp field to 12345. - """ - if "event" not in kwargs: - event = self._camel_to_snake(self.__class__.__name__) - kwargs["event"] = event - super().__init__(*args, **kwargs) - - -class RunnerInstalled(Event): - """Metric event for when a runner is installed. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - duration: The duration of the installation in seconds. - """ - - flavor: str - duration: NonNegativeFloat - - -class RunnerStart(Event): - """Metric event for when a runner is started. - - Attributes: - flavor: Describes the characteristics of the runner. 
- The flavor could be for example "small". - workflow: The workflow name. - repo: The repository name. - github_event: The github event. - idle: The idle time in seconds. - queue_duration: The time in seconds it took before the runner picked up the job. - This is optional as we rely on the Github API and there may be problems - retrieving the data. - """ - - flavor: str - workflow: str - repo: str - github_event: str - idle: NonNegativeFloat - queue_duration: Optional[NonNegativeFloat] - - -class CodeInformation(BaseModel): - """Information about a status code. - - This could e.g. be an exit code or a http status code. - - Attributes: - code: The status code. - """ - - code: int - - -class RunnerStop(Event): - """Metric event for when a runner is stopped. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - workflow: The workflow name. - repo: The repository name. - github_event: The github event. - status: A string describing the reason for stopping the runner. - status_info: More information about the status. - job_duration: The duration of the job in seconds. - job_conclusion: The job conclusion, e.g. "success", "failure", ... - """ - - flavor: str - workflow: str - repo: str - github_event: str - status: str - status_info: Optional[CodeInformation] - job_duration: NonNegativeFloat - job_conclusion: Optional[str] - - -class Reconciliation(Event): - """Metric event for when the charm has finished reconciliation. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - crashed_runners: The number of crashed runners. - idle_runners: The number of idle runners. - duration: The duration of the reconciliation in seconds. - """ - - flavor: str - crashed_runners: int - idle_runners: int - duration: NonNegativeFloat - - -def issue_event(event: Event) -> None: - """Issue a metric event. - - The metric event is logged to the metrics log. 
- - Args: - event: The metric event to log. - - Raises: - IssueMetricEventError: If the event cannot be logged. - """ - try: - with METRICS_LOG_PATH.open(mode="a", encoding="utf-8") as metrics_file: - metrics_file.write(f"{event.json(exclude_none=True)}\n") - except OSError as exc: - raise IssueMetricEventError(f"Cannot write to {METRICS_LOG_PATH}") from exc diff --git a/src/metrics/github.py b/src/metrics/github.py deleted file mode 100644 index e40574eb7..000000000 --- a/src/metrics/github.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Functions to calculate metrics from data retrieved from GitHub.""" -import logging - -from charm_state import GitHubRepo -from errors import GithubMetricsError, JobNotFoundError -from github_client import GithubClient -from metrics.runner import PreJobMetrics -from metrics.type import GithubJobMetrics - -logger = logging.getLogger(__name__) - - -def job( - github_client: GithubClient, pre_job_metrics: PreJobMetrics, runner_name: str -) -> GithubJobMetrics: - """Calculate the job metrics for a runner. - - The Github API is accessed to retrieve the job data for the runner. - - Args: - github_client: The GitHub API client. - pre_job_metrics: The pre-job metrics. - runner_name: The name of the runner. - - Raises: - GithubMetricsError: If the job for given workflow run is not found. - - Returns: - The job metrics. 
- """ - owner, repo = pre_job_metrics.repository.split("/", maxsplit=1) - - try: - job_info = github_client.get_job_info( - path=GitHubRepo(owner=owner, repo=repo), - workflow_run_id=pre_job_metrics.workflow_run_id, - runner_name=runner_name, - ) - except JobNotFoundError as exc: - raise GithubMetricsError from exc - logger.debug( - "Job info for runner %s with workflow run id %s: %s", - runner_name, - pre_job_metrics.workflow_run_id, - job_info, - ) - - queue_duration = (job_info.started_at - job_info.created_at).total_seconds() - - return GithubJobMetrics(queue_duration=queue_duration, conclusion=job_info.conclusion) diff --git a/src/metrics/runner.py b/src/metrics/runner.py deleted file mode 100644 index b0ccc191a..000000000 --- a/src/metrics/runner.py +++ /dev/null @@ -1,470 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Classes and function to extract the metrics from storage and issue runner metrics events.""" - -import json -import logging -from enum import Enum -from json import JSONDecodeError -from pathlib import Path -from typing import Iterator, Optional, Type - -from pydantic import BaseModel, Field, NonNegativeFloat, ValidationError - -from errors import ( - CorruptMetricDataError, - DeleteMetricsStorageError, - IssueMetricEventError, - RunnerMetricsError, -) -from metrics import events as metric_events -from metrics.storage import MetricsStorage -from metrics.storage import StorageManager as MetricsStorageManager -from metrics.storage import move_to_quarantine -from metrics.type import GithubJobMetrics - -logger = logging.getLogger(__name__) - -FILE_SIZE_BYTES_LIMIT = 1024 -PRE_JOB_METRICS_FILE_NAME = "pre-job-metrics.json" -POST_JOB_METRICS_FILE_NAME = "post-job-metrics.json" -RUNNER_INSTALLED_TS_FILE_NAME = "runner-installed.timestamp" - - -class PreJobMetrics(BaseModel): - """Metrics for the pre-job phase of a runner. 
- - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - workflow: The workflow name. - workflow_run_id: The workflow run id. - repository: The repository path in the format '/'. - event: The github event. - """ - - timestamp: NonNegativeFloat - workflow: str - workflow_run_id: str - repository: str = Field(None, regex=r"^.+/.+$") - event: str - - -class PostJobStatus(str, Enum): - """The status of the post-job phase of a runner. - - Attributes: - NORMAL: Represents a normal post-job. - ABNORMAL: Represents an error with post-job. - REPO_POLICY_CHECK_FAILURE: Represents an error with repo-policy-compliance check. - """ - - NORMAL = "normal" - ABNORMAL = "abnormal" - REPO_POLICY_CHECK_FAILURE = "repo-policy-check-failure" - - -class CodeInformation(BaseModel): - """Information about a status code. - - Attributes: - code: The status code. - """ - - code: int - - -class PostJobMetrics(BaseModel): - """Metrics for the post-job phase of a runner. - - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - status: The status of the job. - status_info: More information about the status. - """ - - timestamp: NonNegativeFloat - status: PostJobStatus - status_info: Optional[CodeInformation] - - -class RunnerMetrics(BaseModel): - """Metrics for a runner. - - Attributes: - installed_timestamp: The UNIX time stamp of the time at which the runner was installed. - pre_job: The metrics for the pre-job phase. - post_job: The metrics for the post-job phase. - runner_name: The name of the runner. - """ - - installed_timestamp: NonNegativeFloat - pre_job: PreJobMetrics - post_job: Optional[PostJobMetrics] - runner_name: str - - -def extract( - metrics_storage_manager: MetricsStorageManager, runners: set[str], include: bool = False -) -> Iterator[RunnerMetrics]: - """Extract metrics from runners. - - The metrics are extracted from the metrics storage of the runners. 
- Orphan storages are cleaned up. - - If corrupt data is found, the metrics are not processed further and the storage is moved - to a special quarantine directory, as this may indicate that a malicious - runner is trying to manipulate the files on the storage. - - In order to avoid DoS attacks, the file size is also checked. - - Args: - metrics_storage_manager: The metrics storage manager. - runners: The runners to include or exclude. - include: If true the provided runners are included for metric extraction, else the provided - runners are excluded. - - Yields: - Extracted runner metrics of a particular runner. - """ - for ms in metrics_storage_manager.list_all(): - if (include and ms.runner_name in runners) or ( - not include and ms.runner_name not in runners - ): - runner_metrics = _extract_storage( - metrics_storage_manager=metrics_storage_manager, metrics_storage=ms - ) - if not runner_metrics: - logger.warning("Not able to issue metrics for runner %s", ms.runner_name) - else: - yield runner_metrics - - -def issue_events( - runner_metrics: RunnerMetrics, - flavor: str, - job_metrics: Optional[GithubJobMetrics], -) -> set[Type[metric_events.Event]]: - """Issue the metrics events for a runner. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Returns: - A set of issued events. - """ - runner_start_event = _create_runner_start(runner_metrics, flavor, job_metrics) - - issued_events = set() - try: - metric_events.issue_event(runner_start_event) - except ValidationError: - logger.exception( - "Not able to issue RunnerStart metric for " - "runner %s with pre-job metrics %s and job_metrics %s." - "Will not issue RunnerStop metric.", - runner_metrics.runner_name, - runner_metrics.pre_job, - job_metrics, - ) - except IssueMetricEventError: - logger.exception( - "Not able to issue RunnerStart metric for runner %s. 
" - "Will not issue RunnerStop metric.", - runner_metrics.runner_name, - ) - else: - issued_events = {metric_events.RunnerStart} - - # Return to not issuing RunnerStop metrics if RunnerStart metric could not be issued. - if not issued_events: - return issued_events - - if runner_metrics.post_job: - runner_stop_event = _create_runner_stop(runner_metrics, flavor, job_metrics) - - try: - metric_events.issue_event(runner_stop_event) - except ValidationError: - logger.exception( - "Not able to issue RunnerStop metric for " - "runner %s with pre-job metrics %s, post-job metrics %s and job_metrics %s.", - runner_metrics.runner_name, - runner_metrics.pre_job, - runner_metrics.post_job, - job_metrics, - ) - except IssueMetricEventError: - logger.exception( - "Not able to issue RunnerStop metric for runner %s.", runner_metrics.runner_name - ) - return issued_events - - issued_events.add(metric_events.RunnerStop) - - return issued_events - - -def _create_runner_start( - runner_metrics: RunnerMetrics, flavor: str, job_metrics: Optional[GithubJobMetrics] -) -> metric_events.RunnerStart: - """Create the RunnerStart event. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Returns: - The RunnerStart event. - """ - # When a job gets picked up directly after spawning, the runner_metrics installed timestamp - # might be higher than the pre-job timestamp. This is due to the fact that we issue the runner - # installed timestamp for Openstack after waiting with delays for the runner to be ready. - # We set the idle_duration to 0 in this case. - if runner_metrics.pre_job.timestamp < runner_metrics.installed_timestamp: - logger.warning( - "Pre-job timestamp %d is before installed timestamp %d for runner %s." 
- " Setting idle_duration to zero", - runner_metrics.pre_job.timestamp, - runner_metrics.installed_timestamp, - runner_metrics.runner_name, - ) - idle_duration = max(runner_metrics.pre_job.timestamp - runner_metrics.installed_timestamp, 0) - - # GitHub API returns started_at < created_at in some rare cases. - if job_metrics and job_metrics.queue_duration < 0: - logger.warning( - "Queue duration for runner %s is negative: %f. Setting it to zero.", - runner_metrics.runner_name, - job_metrics.queue_duration, - ) - queue_duration = max(job_metrics.queue_duration, 0) if job_metrics else None - - return metric_events.RunnerStart( - timestamp=runner_metrics.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics.pre_job.workflow, - repo=runner_metrics.pre_job.repository, - github_event=runner_metrics.pre_job.event, - idle=idle_duration, - queue_duration=queue_duration, - ) - - -def _create_runner_stop( - runner_metrics: RunnerMetrics, flavor: str, job_metrics: GithubJobMetrics -) -> metric_events.RunnerStop: - """Create the RunnerStop event. - - Expects that the runner_metrics.post_job is not None. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Raises: - RunnerMetricsError: Post job runner metric not found. Should not happen. - - Returns: - The RunnerStop event. - """ - if runner_metrics.post_job is None: - raise RunnerMetricsError( - "Post job runner metric not found during RunnerStop event, contact developers" - ) - - # When a job gets cancelled directly after spawning, - # the post-job timestamp might be lower then the pre-job timestamp. - # This is due to the fact that we don't have a real post-job script but rather use - # the exit code of the runner application which might exit before the pre-job script - # job is done in edge cases. 
See also: - # https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job#triggering-the-scripts - # We set the job_duration to 0 in this case. - if runner_metrics.post_job.timestamp < runner_metrics.pre_job.timestamp: - logger.warning( - "Post-job timestamp %d is before pre-job timestamp %d for runner %s." - " Setting job_duration to zero", - runner_metrics.post_job.timestamp, - runner_metrics.pre_job.timestamp, - runner_metrics.runner_name, - ) - job_duration = max(runner_metrics.post_job.timestamp - runner_metrics.pre_job.timestamp, 0) - - return metric_events.RunnerStop( - timestamp=runner_metrics.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics.pre_job.workflow, - repo=runner_metrics.pre_job.repository, - github_event=runner_metrics.pre_job.event, - status=runner_metrics.post_job.status, - status_info=runner_metrics.post_job.status_info, - job_duration=job_duration, - job_conclusion=job_metrics.conclusion if job_metrics else None, - ) - - -def _extract_storage( - metrics_storage_manager: MetricsStorageManager, - metrics_storage: MetricsStorage, -) -> Optional[RunnerMetrics]: - """Extract metrics from a metrics storage. - - Args: - metrics_storage_manager: The metrics storage manager. - metrics_storage: The metrics storage for a specific runner. - - Returns: - The extracted metrics if at least the pre-job metrics are present. 
- """ - runner_name = metrics_storage.runner_name - try: - logger.debug("Extracting metrics from metrics storage for runner %s", runner_name) - metrics_from_fs = _extract_metrics_from_storage(metrics_storage) - except CorruptMetricDataError: - logger.exception("Corrupt metric data found for runner %s", runner_name) - move_to_quarantine(metrics_storage_manager, runner_name) - return None - - logger.debug("Cleaning metrics storage for runner %s", runner_name) - _clean_up_storage( - metrics_storage_manager=metrics_storage_manager, metrics_storage=metrics_storage - ) - return metrics_from_fs - - -def _extract_metrics_from_storage(metrics_storage: MetricsStorage) -> Optional[RunnerMetrics]: - """Extract metrics from metrics storage for a runner. - - Args: - metrics_storage: The metrics storage for a specific runner. - - Returns: - The extracted metrics if at least the pre-job metrics are present. - - Raises: - CorruptMetricDataError: Raised if one of the files is not valid or too large. - """ - if too_large_files := _inspect_file_sizes(metrics_storage): - raise CorruptMetricDataError( - f"File size of {too_large_files} is too large. " - f"The limit is {FILE_SIZE_BYTES_LIMIT} bytes." 
- ) - - runner_name = metrics_storage.runner_name - try: - installed_timestamp = metrics_storage.path.joinpath( - RUNNER_INSTALLED_TS_FILE_NAME - ).read_text() - logger.debug("Runner %s installed at %s", runner_name, installed_timestamp) - except FileNotFoundError: - logger.exception("installed_timestamp not found for runner %s", runner_name) - return None - - try: - pre_job_metrics = _extract_file_from_storage( - metrics_storage=metrics_storage, filename=PRE_JOB_METRICS_FILE_NAME - ) - if not pre_job_metrics: - return None - logger.debug("Pre-job metrics for runner %s: %s", runner_name, pre_job_metrics) - - post_job_metrics = _extract_file_from_storage( - metrics_storage=metrics_storage, filename=POST_JOB_METRICS_FILE_NAME - ) - logger.debug("Post-job metrics for runner %s: %s", runner_name, post_job_metrics) - # TODO: 2024-04-02 - We should define a new error, wrap it and re-raise it. - except CorruptMetricDataError: # pylint: disable=try-except-raise - raise - - try: - return RunnerMetrics( - installed_timestamp=installed_timestamp, - pre_job=PreJobMetrics(**pre_job_metrics), - post_job=PostJobMetrics(**post_job_metrics) if post_job_metrics else None, - runner_name=runner_name, - ) - except ValidationError as exc: - raise CorruptMetricDataError(str(exc)) from exc - - -def _inspect_file_sizes(metrics_storage: MetricsStorage) -> tuple[Path, ...]: - """Inspect the file sizes of the metrics storage. - - Args: - metrics_storage: The metrics storage for a specific runner. - - Returns: - A tuple of files whose size is larger than the limit. 
- """ - files: list[Path] = [ - metrics_storage.path.joinpath(PRE_JOB_METRICS_FILE_NAME), - metrics_storage.path.joinpath(POST_JOB_METRICS_FILE_NAME), - metrics_storage.path.joinpath(RUNNER_INSTALLED_TS_FILE_NAME), - ] - - return tuple( - filter(lambda file: file.exists() and file.stat().st_size > FILE_SIZE_BYTES_LIMIT, files) - ) - - -def _extract_file_from_storage(metrics_storage: MetricsStorage, filename: str) -> dict | None: - """Extract a particular metric file from metrics storage. - - Args: - metrics_storage: The metrics storage for a specific runner. - filename: The metrics filename. - - Raises: - CorruptMetricDataError: If any errors have been found within the metric. - - Returns: - Metrics for the given runner if present. - """ - try: - job_metrics = json.loads( - metrics_storage.path.joinpath(filename).read_text(encoding="utf-8") - ) - except FileNotFoundError: - logger.warning("%s not found for runner %s.", filename, metrics_storage.runner_name) - return None - except JSONDecodeError as exc: - raise CorruptMetricDataError(str(exc)) from exc - if not isinstance(job_metrics, dict): - raise CorruptMetricDataError( - f"{filename} metrics for runner {metrics_storage.runner_name} is not a JSON object." - ) - return job_metrics - - -def _clean_up_storage( - metrics_storage_manager: MetricsStorageManager, metrics_storage: MetricsStorage -) -> None: - """Clean up the metrics storage. - - Remove all metric files and afterwards the storage. - - Args: - metrics_storage_manager: The metrics storage manager. - metrics_storage: The metrics storage for a specific runner. 
- """ - try: - metrics_storage.path.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).unlink(missing_ok=True) - metrics_storage.path.joinpath(PRE_JOB_METRICS_FILE_NAME).unlink(missing_ok=True) - metrics_storage.path.joinpath(POST_JOB_METRICS_FILE_NAME).unlink(missing_ok=True) - except OSError: - logger.exception( - "Could not remove metric files for runner %s, " - "this may lead to duplicate metrics issued", - metrics_storage.runner_name, - ) - - try: - metrics_storage_manager.delete(metrics_storage.runner_name) - except DeleteMetricsStorageError: - logger.exception( - "Could not delete metrics storage for runner %s.", metrics_storage.runner_name - ) diff --git a/src/metrics/runner_logs.py b/src/metrics/runner_logs.py deleted file mode 100644 index ec7923c9c..000000000 --- a/src/metrics/runner_logs.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Functions to pull and remove the logs of the crashed runners.""" - -import logging -import shutil -import time -from datetime import datetime -from pathlib import Path - -RUNNER_LOGS_DIR_PATH = Path("/var/log/github-runner-logs") - -SYSLOG_PATH = Path("/var/log/syslog") - -OUTDATED_LOGS_IN_SECONDS = 7 * 24 * 60 * 60 - -logger = logging.getLogger(__name__) - - -def create_logs_dir(runner_name: str) -> Path: - """Create the directory to store the logs of the crashed runners. - - Args: - runner_name: The name of the runner. - - Returns: - The path to the directory where the logs of the crashed runners will be stored. - """ - target_log_path = RUNNER_LOGS_DIR_PATH / runner_name - target_log_path.mkdir(parents=True, exist_ok=True) - - return target_log_path - - -def remove_outdated() -> None: - """Remove the logs that are too old.""" - maxage_absolute = time.time() - OUTDATED_LOGS_IN_SECONDS - dt_object = datetime.fromtimestamp(maxage_absolute) - logger.info( - "Removing the outdated logs of the crashed runners. 
" - "All logs older than %s will be removed.", - dt_object.strftime("%Y-%m-%d %H:%M:%S"), - ) - - for log_path in RUNNER_LOGS_DIR_PATH.glob("*"): - if log_path.is_dir() and (log_path.stat().st_mtime < maxage_absolute): - logger.info("Removing the outdated logs of the runner %s.", log_path.name) - try: - shutil.rmtree(log_path) - except OSError: - logger.exception( - "Unable to remove the outdated logs of the runner %s.", log_path.name - ) diff --git a/src/metrics/storage.py b/src/metrics/storage.py deleted file mode 100644 index c9b41a2f5..000000000 --- a/src/metrics/storage.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Classes and functions defining the metrics storage. - -It contains a protocol and reference implementation. -""" -import logging -import shutil -import tarfile -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, Iterator, Protocol - -from errors import ( - CreateMetricsStorageError, - DeleteMetricsStorageError, - GetMetricsStorageError, - QuarantineMetricsStorageError, -) - -FILESYSTEM_OWNER = "ubuntu:ubuntu" -FILESYSTEM_BASE_PATH = Path("/home/ubuntu/runner-fs") -FILESYSTEM_QUARANTINE_PATH = Path("/home/ubuntu/runner-fs-quarantine") - -logger = logging.getLogger(__name__) - - -@dataclass -class MetricsStorage: - """Storage for the metrics. - - Attributes: - path: The path to the directory holding the metrics inside the charm. - runner_name: The name of the associated runner. - """ - - path: Path - runner_name: str - - -class StorageManager(Protocol): # pylint: disable=too-few-public-methods - """A protocol defining the methods for managing the metrics storage. - - Attributes: - create: Method to create a new storage. Returns the created storage. - Raises an exception CreateMetricsStorageError if the storage already exists. - list_all: Method to list all storages. - get: Method to get a storage by name. 
- delete: Method to delete a storage by name. - """ - - create: Callable[[str], MetricsStorage] - list_all: Callable[[], Iterator[MetricsStorage]] - get: Callable[[str], MetricsStorage] - delete: Callable[[str], None] - - -def _get_runner_fs_path(runner_name: str) -> Path: - """Get the path of the runner shared filesystem. - - Args: - runner_name: The name of the runner. - - Returns: - The path of the runner shared filesystem. - """ - return FILESYSTEM_BASE_PATH / runner_name - - -def create(runner_name: str) -> MetricsStorage: - """Create metrics storage for the runner. - - The method is not idempotent and will raise an exception - if the storage already exists. - - Args: - runner_name: The name of the runner. - - Returns: - The metrics storage object. - - Raises: - CreateMetricsStorageError: If the creation of the shared filesystem fails. - """ - try: - FILESYSTEM_BASE_PATH.mkdir(exist_ok=True) - FILESYSTEM_QUARANTINE_PATH.mkdir(exist_ok=True) - except OSError as exc: - raise CreateMetricsStorageError("Failed to create metrics storage directories") from exc - - runner_fs_path = _get_runner_fs_path(runner_name) - - try: - runner_fs_path.mkdir() - except FileExistsError as exc: - raise CreateMetricsStorageError( - f"Metrics storage for runner {runner_name} already exists." - ) from exc - - return MetricsStorage(runner_fs_path, runner_name) - - -def list_all() -> Iterator[MetricsStorage]: - """List all the metric storages. - - Yields: - A metrics storage object. - """ - if not FILESYSTEM_BASE_PATH.exists(): - return - - directories = (entry for entry in FILESYSTEM_BASE_PATH.iterdir() if entry.is_dir()) - for directory in directories: - try: - fs = get(runner_name=directory.name) - except GetMetricsStorageError: - logger.error("Failed to get metrics storage for runner %s", directory.name) - else: - yield fs - - -def get(runner_name: str) -> MetricsStorage: - """Get the metrics storage for the runner. - - Args: - runner_name: The name of the runner. 
- - Returns: - The metrics storage object. - - Raises: - GetMetricsStorageError: If the storage does not exist. - """ - runner_fs_path = _get_runner_fs_path(runner_name) - if not runner_fs_path.exists(): - raise GetMetricsStorageError(f"Metrics storage for runner {runner_name} not found.") - - return MetricsStorage(runner_fs_path, runner_name) - - -def delete(runner_name: str) -> None: - """Delete the metrics storage for the runner. - - Args: - runner_name: The name of the runner. - - Raises: - DeleteMetricsStorageError: If the storage could not be deleted. - """ - runner_fs_path = _get_runner_fs_path(runner_name=runner_name) - - try: - shutil.rmtree(runner_fs_path) - except OSError as exc: - raise DeleteMetricsStorageError( - f"Failed to remove metrics storage for runner {runner_name}" - ) from exc - - -def move_to_quarantine(storage_manager: StorageManager, runner_name: str) -> None: - """Archive the metrics storage for the runner and delete it. - - Args: - storage_manager: The storage manager. - runner_name: The name of the runner. - - Raises: - QuarantineMetricsStorageError: If the metrics storage could not be quarantined. 
- """ - try: - runner_fs = storage_manager.get(runner_name) - except GetMetricsStorageError as exc: - raise QuarantineMetricsStorageError( - f"Failed to get metrics storage for runner {runner_name}" - ) from exc - - tarfile_path = FILESYSTEM_QUARANTINE_PATH.joinpath(runner_name).with_suffix(".tar.gz") - try: - with tarfile.open(tarfile_path, "w:gz") as tar: - tar.add(runner_fs.path, arcname=runner_fs.path.name) - except OSError as exc: - raise QuarantineMetricsStorageError( - f"Failed to archive metrics storage for runner {runner_name}" - ) from exc - - try: - storage_manager.delete(runner_name) - except DeleteMetricsStorageError as exc: - raise QuarantineMetricsStorageError( - f"Failed to delete metrics storage for runner {runner_name}" - ) from exc diff --git a/src/metrics/type.py b/src/metrics/type.py deleted file mode 100644 index fd45314f6..000000000 --- a/src/metrics/type.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Data types used by modules handling metrics.""" - -from typing import NamedTuple, Optional - -from github_type import JobConclusion - - -class GithubJobMetrics(NamedTuple): - """Metrics about a job. - - Attributes: - queue_duration: The time in seconds the job took before the runner picked it up. - conclusion: The conclusion of the job. - """ - - queue_duration: float - conclusion: Optional[JobConclusion] diff --git a/src/openstack_cloud/__init__.py b/src/openstack_cloud/__init__.py deleted file mode 100644 index 3f9935aab..000000000 --- a/src/openstack_cloud/__init__.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Module for managing Openstack cloud.""" - -import logging -from pathlib import Path -from typing import TypedDict, cast - -import yaml - -from errors import OpenStackInvalidConfigError - -logger = logging.getLogger(__name__) - - -CLOUDS_YAML_PATH = Path(Path.home() / ".config/openstack/clouds.yaml") - - -class CloudConfig(TypedDict): - """The parsed clouds.yaml configuration dictionary. - - Attributes: - clouds: A mapping of key "clouds" to cloud name mapped to cloud configuration. - """ - - clouds: dict[str, dict] - - -def _validate_cloud_config(cloud_config: dict) -> CloudConfig: - """Validate the format of the cloud configuration. - - Args: - cloud_config: The configuration in clouds.yaml format to validate. - - Raises: - OpenStackInvalidConfigError: if the format of the config is invalid. - - Returns: - A typed cloud_config dictionary. - """ - # dict of format: {clouds: : } - try: - clouds = list(cloud_config["clouds"].keys()) - except KeyError as exc: - raise OpenStackInvalidConfigError("Missing key 'clouds' from config.") from exc - if not clouds: - raise OpenStackInvalidConfigError("No clouds defined in clouds.yaml.") - return cast(CloudConfig, cloud_config) - - -def _write_config_to_disk(cloud_config: CloudConfig) -> None: - """Write the cloud configuration to disk. - - Args: - cloud_config: The configuration in clouds.yaml format to write to disk. - """ - CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) - CLOUDS_YAML_PATH.write_text(encoding="utf-8", data=yaml.dump(cloud_config)) - - -def initialize(cloud_config: dict) -> None: - """Initialize Openstack integration. - - Validates config and writes it to disk. - - Raises: - OpenStackInvalidConfigError: If there was an given cloud config. - - Args: - cloud_config: The configuration in clouds.yaml format to apply. - """ - try: - valid_config = _validate_cloud_config(cloud_config) - # TODO: 2024-04-02 - We should define a new error, wrap it and re-raise it. 
- except OpenStackInvalidConfigError: # pylint: disable=try-except-raise - raise - _write_config_to_disk(valid_config) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py deleted file mode 100644 index ad21f4d97..000000000 --- a/src/openstack_cloud/openstack_cloud.py +++ /dev/null @@ -1,597 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Class for accessing OpenStack API for managing servers.""" - -import logging -from contextlib import contextmanager -from dataclasses import dataclass -from datetime import datetime -from functools import reduce -from pathlib import Path -from typing import Iterable, Iterator, cast - -import openstack -import openstack.exceptions -import paramiko -import yaml -from fabric import Connection as SSHConnection -from openstack.compute.v2.keypair import Keypair as OpenstackKeypair -from openstack.compute.v2.server import Server as OpenstackServer -from openstack.connection import Connection as OpenstackConnection -from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup -from paramiko.ssh_exception import NoValidConnectionsError - -from errors import KeyfileError, OpenStackError, SSHError -from utilities import retry - -logger = logging.getLogger(__name__) - -_CLOUDS_YAML_PATH = Path.home() / ".config/openstack/clouds.yaml" - -# Update the version when the security group rules are not backward compatible. -_SECURITY_GROUP_NAME = "github-runner-v1" - -_CREATE_SERVER_TIMEOUT = 5 * 60 -_SSH_TIMEOUT = 30 -_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") -_TEST_STRING = "test_string" - - -@dataclass -class OpenstackInstance: - """Represents an OpenStack instance. - - Attributes: - server_id: ID of server assigned by OpenStack. - server_name: Name of the server on OpenStack. - instance_id: ID used by OpenstackCloud class to manage the instances. See docs on the - OpenstackCloud. - addresses: IP addresses assigned to the server. 
- status: Status of the server. - """ - - server_id: str - server_name: str - instance_id: str - addresses: list[str] - status: str - - def __init__(self, server: OpenstackServer, prefix: str): - """Construct the object. - - Args: - server: The OpenStack server. - prefix: The name prefix for the servers. - - Raises: - ValueError: Provided server should not be managed under this prefix. - """ - self.server_id = server.id - self.server_name = server.name - self.status = server.status - self.addresses = [ - address["addr"] - for network_addresses in server.addresses.values() - for address in network_addresses - ] - - if not self.server_name.startswith(f"{prefix}-"): - # Should never happen. - raise ValueError( - f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" - ) - self.instance_id = self.server_name[len(prefix) + 1 :] - - -@contextmanager -@retry(tries=2, delay=5, local_logger=logger) -def _get_openstack_connection( - clouds_config: dict[str, dict], cloud: str -) -> Iterator[OpenstackConnection]: - """Create a connection context managed object, to be used within with statements. - - The file of _CLOUDS_YAML_PATH should only be modified by this function. - - Args: - clouds_config: The configuration in clouds.yaml format to apply. - cloud: The name of cloud to use in the clouds.yaml. - - Raises: - OpenStackError: if the credentials provided is not authorized. - - Yields: - An openstack.connection.Connection object. - """ - if not _CLOUDS_YAML_PATH.exists(): - _CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) - - # Concurrency: Very small chance for the file to be corrupted due to multiple process calling - # this function and writing the file at the same time. This should cause the `conn.authorize` - # to fail, and retry of this function would resolve this. 
- _CLOUDS_YAML_PATH.write_text(data=yaml.dump(clouds_config), encoding="utf-8") - - # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but - # I could not reproduce it. Therefore, no catch here for such exception. - try: - with openstack.connect(cloud=cloud) as conn: - conn.authorize() - yield conn - # pylint thinks this isn't an exception, but does inherit from Exception class. - except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause - logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc - - -class OpenstackCloud: - """Client to interact with OpenStack cloud. - - The OpenStack server name is managed by this cloud. Caller refers to the instances via - instance_id. If the caller needs the server name, e.g., for logging, it can be queried with - get_server_name. - """ - - def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): - """Create the object. - - Args: - clouds_config: The openstack clouds.yaml in dict format. - cloud: The name of cloud to use in the clouds.yaml. - prefix: Prefix attached to names of resource managed by this instance. Used for - identifying which resource belongs to this instance. - """ - self._clouds_config = clouds_config - self._cloud = cloud - self.prefix = prefix - - # Ignore "Too many arguments" as 6 args should be fine. Move to a dataclass if new args are - # added. - def launch_instance( # pylint: disable=R0913 - self, instance_id: str, image: str, flavor: str, network: str, cloud_init: str - ) -> OpenstackInstance: - """Create an OpenStack instance. - - Args: - instance_id: The instance ID to form the instance name. - image: The image used to create the instance. - flavor: The flavor used to create the instance. - network: The network used to create the instance. - cloud_init: The cloud init userdata to startup the instance. - - Raises: - OpenStackError: Unable to create OpenStack server. 
- - Returns: - The OpenStack instance created. - """ - full_name = self.get_server_name(instance_id) - logger.info("Creating openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - security_group = OpenstackCloud._ensure_security_group(conn) - keypair = OpenstackCloud._setup_keypair(conn, full_name) - - try: - server = conn.create_server( - name=full_name, - image=image, - key_name=keypair.name, - flavor=flavor, - network=network, - security_groups=[security_group.id], - userdata=cloud_init, - auto_ip=False, - timeout=_CREATE_SERVER_TIMEOUT, - wait=True, - ) - except openstack.exceptions.ResourceTimeout as err: - logger.exception("Timeout creating openstack server %s", full_name) - logger.info( - "Attempting clean up of openstack server %s that timeout during creation", - full_name, - ) - self._delete_instance(conn, full_name) - raise OpenStackError(f"Timeout creating openstack server {full_name}") from err - except openstack.exceptions.SDKException as err: - logger.exception("Failed to create openstack server %s", full_name) - self._delete_keypair(conn, instance_id) - raise OpenStackError(f"Failed to create openstack server {full_name}") from err - - return OpenstackInstance(server, self.prefix) - - def get_instance(self, instance_id: str) -> OpenstackInstance | None: - """Get OpenStack instance by instance ID. - - Args: - instance_id: The instance ID. - - Returns: - The OpenStack instance if found. - """ - full_name = self.get_server_name(instance_id) - logger.info("Getting openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - if server is not None: - return OpenstackInstance(server, self.prefix) - return None - - def delete_instance(self, instance_id: str) -> None: - """Delete a openstack instance. 
- - Args: - instance_id: The instance ID of the instance to delete. - """ - full_name = self.get_server_name(instance_id) - logger.info("Deleting openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - self._delete_instance(conn, full_name) - - def _delete_instance(self, conn: OpenstackConnection, full_name: str) -> None: - """Delete a openstack instance. - - Raises: - OpenStackError: Unable to delete OpenStack server. - - Args: - conn: The openstack connection to use. - full_name: The full name of the server. - """ - try: - server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - if server is not None: - conn.delete_server(name_or_id=server.id) - OpenstackCloud._delete_keypair(conn, full_name) - except ( - openstack.exceptions.SDKException, - openstack.exceptions.ResourceTimeout, - ) as err: - raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err - - def get_ssh_connection(self, instance: OpenstackInstance) -> SSHConnection: - """Get SSH connection to an OpenStack instance. - - Args: - instance: The OpenStack instance to connect to. - - Raises: - SSHError: Unable to get a working SSH connection to the instance. - KeyfileError: Unable to find the keyfile to connect to the instance. - - Returns: - SSH connection object. 
- """ - key_path = OpenstackCloud._get_key_path(instance.server_name) - - if not key_path.exists(): - raise KeyfileError( - f"Missing keyfile for server: {instance.server_name}, key path: {key_path}" - ) - if not instance.addresses: - raise SSHError(f"No addresses found for OpenStack server {instance.server_name}") - - for ip in instance.addresses: - try: - connection = SSHConnection( - host=ip, - user="ubuntu", - connect_kwargs={"key_filename": str(key_path)}, - connect_timeout=_SSH_TIMEOUT, - ) - result = connection.run(f"echo {_TEST_STRING}", warn=True, timeout=_SSH_TIMEOUT) - if not result.ok: - logger.warning( - "SSH test connection failed, server: %s, address: %s", - instance.server_name, - ip, - ) - continue - if _TEST_STRING in result.stdout: - return connection - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException): - logger.warning( - "Unable to SSH into %s with address %s", - instance.server_name, - connection.host, - exc_info=True, - ) - continue - raise SSHError( - f"No connectable SSH addresses found, server: {instance.server_name}, " - f"addresses: {instance.addresses}" - ) - - def get_instances(self) -> tuple[OpenstackInstance, ...]: - """Get all OpenStack instances. - - Returns: - The OpenStack instances. 
- """ - logger.info("Getting all openstack servers managed by the charm") - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - instance_list = self._get_openstack_instances(conn) - server_names = set(server.name for server in instance_list) - - server_list = [ - OpenstackCloud._get_and_ensure_unique_server(conn, name) for name in server_names - ] - return tuple( - OpenstackInstance(server, self.prefix) - for server in server_list - if server is not None - ) - - def cleanup(self) -> None: - """Cleanup unused key files and openstack keypairs.""" - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - instances = self._get_openstack_instances(conn) - exclude_list = [server.name for server in instances] - self._cleanup_key_files(exclude_list) - self._cleanup_openstack_keypairs(conn, exclude_list) - - def get_server_name(self, instance_id: str) -> str: - """Get server name on OpenStack. - - Args: - instance_id: ID used to identify a instance. - - Returns: - The OpenStack server name. - """ - return f"{self.prefix}-{instance_id}" - - def _cleanup_key_files(self, exclude_instances: Iterable[str]) -> None: - """Delete all SSH key files except the specified instances. - - Args: - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up SSH key files") - exclude_filename = set( - OpenstackCloud._get_key_path(instance) for instance in exclude_instances - ) - - total = 0 - deleted = 0 - for path in _SSH_KEY_PATH.iterdir(): - # Find key file from this application. 
- if path.is_file() and path.name.startswith(self.prefix) and path.name.endswith(".key"): - total += 1 - if path in exclude_filename: - continue - path.unlink() - deleted += 1 - logger.info("Found %s key files, clean up %s key files", total, deleted) - - def _cleanup_openstack_keypairs( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all OpenStack keypairs except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up openstack keypairs") - exclude_instance_set = set(exclude_instances) - keypairs = conn.list_keypairs() - for key in keypairs: - # The `name` attribute is of resource.Body type. - if key.name and str(key.name).startswith(self.prefix): - if str(key.name) in exclude_instance_set: - continue - try: - self._delete_keypair(conn, key.name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - key.name, - ) - - def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[OpenstackServer, ...]: - """Get the OpenStack servers managed by this unit. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - List of OpenStack instances. - """ - return tuple( - server - for server in cast(list[OpenstackServer], conn.list_servers()) - if server.name.startswith(f"{self.prefix}-") - ) - - @staticmethod - def _get_and_ensure_unique_server( - conn: OpenstackConnection, name: str - ) -> OpenstackServer | None: - """Get the latest server of the name and ensure it is unique. - - If multiple servers with the same name are found, the latest server in creation time is - returned. Other servers is deleted. - - Args: - conn: The connection to OpenStack. - name: The name of the OpenStack name. - - Returns: - A server with the name. 
- """ - servers: list[OpenstackServer] = conn.search_servers(name) - - if not servers: - return None - - # 2024/08/14: The `format` arg for `strptime` is the default format. - # This is only provided to get around a bug of the function with type checking. - latest_server = reduce( - lambda a, b: ( - a - if datetime.strptime(a.created_at, "a %b %d %H:%M:%S %Y") - < datetime.strptime(b.create_at, "a %b %d %H:%M:%S %Y") - else b - ), - servers, - ) - outdated_servers = filter(lambda x: x != latest_server, servers) - for server in outdated_servers: - try: - conn.delete_server(name_or_id=server.id) - except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): - logger.warning( - "Unable to delete server with duplicate name %s with ID %s", - name, - server.id, - stack_info=True, - ) - - return latest_server - - @staticmethod - def _get_key_path(name: str) -> Path: - """Get the filepath for storing private SSH of a runner. - - Args: - name: The name of the runner. - - Returns: - Path to reserved for the key file of the runner. - """ - return _SSH_KEY_PATH / f"{name}.key" - - @staticmethod - def _setup_keypair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: - """Create OpenStack keypair. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the keypair. - - Returns: - The OpenStack keypair. - """ - key_path = OpenstackCloud._get_key_path(name) - - if key_path.exists(): - logger.warning("Existing private key file for %s found, removing it.", name) - key_path.unlink(missing_ok=True) - - keypair = conn.create_keypair(name=name) - key_path.parent.mkdir(parents=True, exist_ok=True) - key_path.write_text(keypair.private_key) - key_path.chmod(0o400) - return keypair - - @staticmethod - def _delete_keypair(conn: OpenstackConnection, name: str) -> None: - """Delete OpenStack keypair. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the keypair. 
- """ - try: - # Keypair have unique names, access by ID is not needed. - if not conn.delete_keypair(name): - logger.warning("Unable to delete keypair for %s", name) - except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): - logger.warning("Unable to delete keypair for %s", name, stack_info=True) - - key_path = OpenstackCloud._get_key_path(name) - key_path.unlink(missing_ok=True) - - @staticmethod - def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: - """Ensure runner security group exists. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - The security group with the rules for runners. - """ - rule_exists_icmp = False - rule_exists_ssh = False - rule_exists_tmate_ssh = False - - security_group_list = conn.list_security_groups(filters={"name": _SECURITY_GROUP_NAME}) - # Pick the first security_group returned. - security_group = next(iter(security_group_list), None) - if security_group is None: - logger.info("Security group %s not found, creating it", _SECURITY_GROUP_NAME) - security_group = conn.create_security_group( - name=_SECURITY_GROUP_NAME, - description="For servers managed by the github-runner charm.", - ) - else: - existing_rules = security_group.security_group_rules - for rule in existing_rules: - if rule["protocol"] == "icmp": - logger.debug( - "Found ICMP rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - rule_exists_icmp = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 22 - ): - logger.debug( - "Found SSH rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - rule_exists_ssh = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 10022 - ): - logger.debug( - "Found tmate SSH rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - 
rule_exists_tmate_ssh = True - - if not rule_exists_icmp: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - protocol="icmp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - port_range_min="22", - port_range_max="22", - protocol="tcp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_tmate_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - port_range_min="10022", - port_range_max="10022", - protocol="tcp", - direction="egress", - ethertype="IPv4", - ) - return security_group diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py deleted file mode 100644 index 11bac0b92..000000000 --- a/src/openstack_cloud/openstack_runner_manager.py +++ /dev/null @@ -1,830 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Manager for self-hosted runner on OpenStack.""" - -import logging -import secrets -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Iterator, Sequence - -import invoke -import jinja2 -import paramiko -import paramiko.ssh_exception -from fabric import Connection as SSHConnection - -from charm_state import GitHubOrg -from errors import ( - CreateMetricsStorageError, - GetMetricsStorageError, - IssueMetricEventError, - KeyfileError, - MissingServerConfigError, - OpenStackError, - RunnerCreateError, - RunnerStartError, - SSHError, -) -from manager.cloud_runner_manager import ( - CloudRunnerInstance, - CloudRunnerManager, - CloudRunnerState, - GitHubRunnerConfig, - InstanceId, - SupportServiceConfig, -) -from manager.runner_manager import HealthState -from metrics import events as metric_events -from metrics import runner as runner_metrics -from metrics import storage as metrics_storage -from openstack_cloud.openstack_cloud import OpenstackCloud, 
OpenstackInstance -from repo_policy_compliance_client import RepoPolicyComplianceClient -from utilities import retry, set_env_var - -logger = logging.getLogger(__name__) - -BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME = "scripts/build-openstack-image.sh" -_CONFIG_SCRIPT_PATH = Path("/home/ubuntu/actions-runner/config.sh") - -RUNNER_APPLICATION = Path("/home/ubuntu/actions-runner") -METRICS_EXCHANGE_PATH = Path("/home/ubuntu/metrics-exchange") -PRE_JOB_SCRIPT = RUNNER_APPLICATION / "pre-job.sh" -MAX_METRICS_FILE_SIZE = 1024 - -RUNNER_STARTUP_PROCESS = "/home/ubuntu/actions-runner/run.sh" -RUNNER_LISTENER_PROCESS = "Runner.Listener" -RUNNER_WORKER_PROCESS = "Runner.Worker" -CREATE_SERVER_TIMEOUT = 5 * 60 - - -class _GithubRunnerRemoveError(Exception): - """Represents an error while SSH into a runner and running the remove script.""" - - -class _PullFileError(Exception): - """Represents an error while pulling a file from the runner instance.""" - - -@dataclass -class OpenStackCloudConfig: - """Configuration for OpenStack cloud authorisation information. - - Attributes: - clouds_config: The clouds.yaml. - cloud: The cloud name to connect to. - """ - - clouds_config: dict[str, dict] - cloud: str - - -@dataclass -class OpenStackServerConfig: - """Configuration for OpenStack server. - - Attributes: - image: The image name for runners to use. - flavor: The flavor name for runners to use. - network: The network name for runners to use. - """ - - image: str - flavor: str - network: str - - -@dataclass -class _RunnerHealth: - """Runners with health state. - - Attributes: - healthy: The list of healthy runners. - unhealthy: The list of unhealthy runners. - """ - - healthy: tuple[OpenstackInstance, ...] - unhealthy: tuple[OpenstackInstance, ...] - - -class OpenStackRunnerManager(CloudRunnerManager): - """Manage self-hosted runner on OpenStack cloud. - - Attributes: - name_prefix: The name prefix of the runners created. 
- """ - - # Ignore "Too many arguments", as the class requires a lot of configurations. - def __init__( # pylint: disable=R0913 - self, - manager_name: str, - prefix: str, - cloud_config: OpenStackCloudConfig, - server_config: OpenStackServerConfig | None, - runner_config: GitHubRunnerConfig, - service_config: SupportServiceConfig, - ) -> None: - """Construct the object. - - Args: - manager_name: A name to identify this manager. - prefix: The prefix to runner name. - cloud_config: The configuration for OpenStack authorisation. - server_config: The configuration for creating OpenStack server. Unable to create - runner if None. - runner_config: The configuration for the runner. - service_config: The configuration of supporting services of the runners. - """ - self._manager_name = manager_name - self._prefix = prefix - self._cloud_config = cloud_config - self._server_config = server_config - self._runner_config = runner_config - self._service_config = service_config - self._openstack_cloud = OpenstackCloud( - clouds_config=self._cloud_config.clouds_config, - cloud=self._cloud_config.cloud, - prefix=self.name_prefix, - ) - - # Setting the env var to this process and any child process spawned. - proxies = service_config.proxy_config - if no_proxy := proxies.no_proxy: - set_env_var("NO_PROXY", no_proxy) - if http_proxy := proxies.http: - set_env_var("HTTP_PROXY", http_proxy) - if https_proxy := proxies.https: - set_env_var("HTTPS_PROXY", https_proxy) - - @property - def name_prefix(self) -> str: - """The prefix of runner names. - - Returns: - The prefix of the runner names managed by this class. - """ - return self._prefix - - def create_runner(self, registration_token: str) -> InstanceId: - """Create a self-hosted runner. - - Args: - registration_token: The GitHub registration token for registering runners. - - Raises: - MissingServerConfigError: Unable to create runner due to missing configuration. - RunnerCreateError: Unable to create runner due to OpenStack issues. 
- - Returns: - Instance ID of the runner. - """ - if self._server_config is None: - raise MissingServerConfigError("Missing server configuration to create runners") - - start_timestamp = time.time() - instance_id = OpenStackRunnerManager._generate_instance_id() - instance_name = self._openstack_cloud.get_server_name(instance_id=instance_id) - cloud_init = self._generate_cloud_init( - instance_name=instance_name, registration_token=registration_token - ) - try: - instance = self._openstack_cloud.launch_instance( - instance_id=instance_id, - image=self._server_config.image, - flavor=self._server_config.flavor, - network=self._server_config.network, - cloud_init=cloud_init, - ) - except OpenStackError as err: - raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err - - self._wait_runner_startup(instance) - self._wait_runner_running(instance) - - end_timestamp = time.time() - OpenStackRunnerManager._issue_runner_installed_metric( - name=instance_name, - flavor=self._manager_name, - install_start_timestamp=start_timestamp, - install_end_timestamp=end_timestamp, - ) - return instance_id - - def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: - """Get a self-hosted runner by instance id. - - Args: - instance_id: The instance id. - - Returns: - Information on the runner instance. - """ - instance = self._openstack_cloud.get_instance(instance_id) - healthy = self._runner_health_check(instance=instance) - return ( - CloudRunnerInstance( - name=instance.server_name, - instance_id=instance_id, - health=HealthState.HEALTHY if healthy else HealthState.UNHEALTHY, - state=CloudRunnerState.from_openstack_server_status(instance.status), - ) - if instance is not None - else None - ) - - def get_runners( - self, states: Sequence[CloudRunnerState] | None = None - ) -> tuple[CloudRunnerInstance, ...]: - """Get self-hosted runners by state. - - Args: - states: Filter for the runners with these github states. 
If None all states will be - included. - - Returns: - Information on the runner instances. - """ - instance_list = self._openstack_cloud.get_instances() - instance_list = [ - CloudRunnerInstance( - name=instance.server_name, - instance_id=instance.instance_id, - health=( - HealthState.HEALTHY - if self._runner_health_check(instance) - else HealthState.UNHEALTHY - ), - state=CloudRunnerState.from_openstack_server_status(instance.status), - ) - for instance in instance_list - ] - if states is None: - return tuple(instance_list) - - state_set = set(states) - return tuple(instance for instance in instance_list if instance.state in state_set) - - def delete_runner( - self, instance_id: InstanceId, remove_token: str - ) -> runner_metrics.RunnerMetrics | None: - """Delete self-hosted runners. - - Args: - instance_id: The instance id of the runner to delete. - remove_token: The GitHub remove token. - - Returns: - Any metrics collected during the deletion of the runner. - """ - instance = self._openstack_cloud.get_instance(instance_id) - if instance is None: - logger.warning( - "Unable to delete instance %s as it is not found", - self._openstack_cloud.get_server_name(instance_id), - ) - return None - - extracted_metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage, runners=set([instance.server_name]) - ) - self._delete_runner(instance, remove_token) - return next(extracted_metrics, None) - - def flush_runners( - self, remove_token: str, busy: bool = False - ) -> Iterator[runner_metrics.RunnerMetrics]: - """Remove idle and/or busy runners. - - Args: - remove_token: - busy: If false, only idle runners are removed. If true, both idle and busy runners are - removed. - - Returns: - Any metrics retrieved from flushed runners. 
- """ - instance_list = self._openstack_cloud.get_instances() - for instance in instance_list: - try: - self._check_state_and_flush(instance, busy) - except SSHError: - logger.warning( - "Unable to determine state of %s and kill runner process due to SSH issues", - instance.server_name, - ) - continue - return self.cleanup(remove_token) - - def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: - """Cleanup runner and resource on the cloud. - - Args: - remove_token: The GitHub remove token. - - Returns: - Any metrics retrieved from cleanup runners. - """ - runners = self._get_runners_health() - healthy_runner_names = [runner.server_name for runner in runners.healthy] - metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage, runners=set(healthy_runner_names) - ) - for runner in runners.unhealthy: - self._delete_runner(runner, remove_token) - - self._openstack_cloud.cleanup() - return metrics - - def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None: - """Delete self-hosted runners by openstack instance. - - Args: - instance: The OpenStack instance. - remove_token: The GitHub remove token. 
- """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - self._pull_runner_metrics(instance.server_name, ssh_conn) - - try: - OpenStackRunnerManager._run_runner_removal_script( - instance.server_name, ssh_conn, remove_token - ) - except _GithubRunnerRemoveError: - logger.warning( - "Unable to run github runner removal script for %s", - instance.server_name, - stack_info=True, - ) - except SSHError: - logger.exception( - "Failed to get SSH connection while removing %s", instance.server_name - ) - logger.warning( - "Skipping runner remove script for %s due to SSH issues", instance.server_name - ) - - try: - self._openstack_cloud.delete_instance(instance.instance_id) - except OpenStackError: - logger.exception( - "Unable to delete openstack instance for runner %s", instance.server_name - ) - - def _get_runners_health(self) -> _RunnerHealth: - """Get runners by health state. - - Returns: - Runners by health state. - """ - runner_list = self._openstack_cloud.get_instances() - - healthy, unhealthy = [], [] - for runner in runner_list: - if self._runner_health_check(runner): - healthy.append(runner) - else: - unhealthy.append(runner) - return _RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) - - def _runner_health_check(self, instance: OpenstackInstance) -> bool: - """Run health check on a runner. - - Args: - instance: The instance hosting the runner to run health check on. - - Returns: - True if runner is healthy. - """ - cloud_state = CloudRunnerState.from_openstack_server_status(instance.status) - return cloud_state not in set( - ( - CloudRunnerState.DELETED, - CloudRunnerState.ERROR, - CloudRunnerState.STOPPED, - ) - ) and self._health_check(instance) - - def _generate_cloud_init(self, instance_name: str, registration_token: str) -> str: - """Generate cloud init userdata. - - This is the script the openstack server runs on startup. - - Args: - instance_name: The name of the instance. 
- registration_token: The GitHub runner registration token. - - Returns: - The cloud init userdata for openstack instance. - """ - jinja = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) - - env_contents = jinja.get_template("env.j2").render( - pre_job_script=str(PRE_JOB_SCRIPT), - dockerhub_mirror=self._service_config.dockerhub_mirror or "", - ssh_debug_info=( - secrets.choice(self._service_config.ssh_debug_connections) - if self._service_config.ssh_debug_connections - else None - ), - ) - - pre_job_contents_dict = { - "issue_metrics": True, - "metrics_exchange_path": str(METRICS_EXCHANGE_PATH), - "do_repo_policy_check": False, - } - repo_policy = self._get_repo_policy_compliance_client() - if repo_policy is not None: - pre_job_contents_dict.update( - { - "repo_policy_base_url": repo_policy.base_url, - "repo_policy_one_time_token": repo_policy.get_one_time_token(), - "do_repo_policy_check": True, - } - ) - - pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) - - runner_group = None - if isinstance(self._runner_config.github_path, GitHubOrg): - runner_group = self._runner_config.github_path.group - aproxy_address = ( - self._service_config.proxy_config.aproxy_address - if self._service_config.proxy_config is not None - else None - ) - return jinja.get_template("openstack-userdata.sh.j2").render( - github_url=f"https://github.com/{self._runner_config.github_path.path()}", - runner_group=runner_group, - token=registration_token, - instance_labels=",".join(self._runner_config.labels), - instance_name=instance_name, - env_contents=env_contents, - pre_job_contents=pre_job_contents, - metrics_exchange_path=str(METRICS_EXCHANGE_PATH), - aproxy_address=aproxy_address, - dockerhub_mirror=self._service_config.dockerhub_mirror, - ) - - def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | None: - """Get repo policy compliance client. - - Returns: - The repo policy compliance client. 
- """ - if self._service_config.repo_policy_compliance is not None: - return RepoPolicyComplianceClient( - self._service_config.repo_policy_compliance.url, - self._service_config.repo_policy_compliance.token, - ) - return None - - @retry(tries=3, delay=5, backoff=2, local_logger=logger) - def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> None: - """Kill runner process depending on idle or busy. - - Due to update to runner state has some delay with GitHub API. The state of the runner is - determined by which runner processes are running. If the Runner.Worker process is running, - the runner is deemed to be busy. - - Raises: - SSHError: Unable to check the state of the runner and kill the runner process due to - SSH failure. - - Args: - instance: The openstack instance to kill the runner process. - busy: Kill the process if runner is busy, else only kill runner - process if runner is idle. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except KeyfileError: - logger.exception( - "Health check failed due to unable to find keyfile for %s", instance.server_name - ) - return - except SSHError: - logger.exception( - "SSH connection failure with %s during flushing", instance.server_name - ) - raise - - # Using a single command to determine the state and kill the process if needed. - # This makes it more robust when network is unstable. - if busy: - logger.info("Attempting to kill all runner process on %s", instance.server_name) - # kill both Runner.Listener and Runner.Worker processes. - # This kills pre-job.sh, a child process of Runner.Worker. - kill_command = ( - f"pgrep -x {RUNNER_LISTENER_PROCESS} && kill $(pgrep -x {RUNNER_LISTENER_PROCESS});" - f"pgrep -x {RUNNER_WORKER_PROCESS} && kill $(pgrep -x {RUNNER_WORKER_PROCESS});" - ) - else: - logger.info( - "Attempting to kill runner process on %s if not busy", instance.server_name - ) - # Only kill Runner.Listener if Runner.Worker does not exist. 
- kill_command = ( - f"pgrep -x {RUNNER_WORKER_PROCESS} || pgrep -x {RUNNER_LISTENER_PROCESS} && " - f"kill $(pgrep -x {RUNNER_LISTENER_PROCESS})" - ) - # Checking the result of kill command is not useful, as the exit code does not reveal much. - ssh_conn.run(kill_command, warn=True) - - @retry(tries=3, delay=5, backoff=2, local_logger=logger) - def _health_check(self, instance: OpenstackInstance) -> bool: - """Check whether runner is healthy. - - Args: - instance: The OpenStack instance to conduit the health check. - - Raises: - SSHError: Unable to get a SSH connection to the instance. - - Returns: - Whether the runner is healthy. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except KeyfileError: - logger.exception( - "Health check failed due to unable to find keyfile for %s", instance.server_name - ) - return False - except SSHError: - logger.exception( - "SSH connection failure with %s during health check", instance.server_name - ) - raise - return OpenStackRunnerManager._run_health_check(ssh_conn, instance.server_name) - - @staticmethod - def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: - """Run a health check for runner process. - - Args: - ssh_conn: The SSH connection to the runner. - name: The name of the runner. - - Returns: - Whether the health succeed. - """ - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s: %s", name, result.stderr) - return False - if ( - RUNNER_WORKER_PROCESS not in result.stdout - and RUNNER_LISTENER_PROCESS not in result.stdout - ): - logger.warning("Runner process not found on %s", name) - return False - return True - - @retry(tries=10, delay=60, local_logger=logger) - def _wait_runner_startup(self, instance: OpenstackInstance) -> None: - """Wait until runner is startup. - - Args: - instance: The runner instance. 
- - Raises: - RunnerStartError: The runner startup process was not found on the runner. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SSHError as err: - raise RunnerStartError( - f"Failed to SSH to {instance.server_name} during creation possible due to setup " - "not completed" - ) from err - - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s", instance.server_name) - raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.server_name}") - if RUNNER_STARTUP_PROCESS not in result.stdout: - logger.warning("Runner startup process not found on %s", instance.server_name) - raise RunnerStartError(f"Runner startup process not found on {instance.server_name}") - logger.info("Runner startup process found to be healthy on %s", instance.server_name) - - @retry(tries=5, delay=60, local_logger=logger) - def _wait_runner_running(self, instance: OpenstackInstance) -> None: - """Wait until runner is running. - - Args: - instance: The runner instance. - - Raises: - RunnerStartError: The runner process was not found on the runner. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SSHError as err: - raise RunnerStartError( - f"Failed to SSH connect to {instance.server_name} openstack runner" - ) from err - - if not self._run_health_check(ssh_conn=ssh_conn, name=instance.server_name): - logger.info("Runner process not found on %s", instance.server_name) - raise RunnerStartError( - f"Runner process on {instance.server_name} failed to initialize on after starting" - ) - - logger.info("Runner process found to be healthy on %s", instance.server_name) - - @staticmethod - def _generate_instance_id() -> InstanceId: - """Generate a instance id. - - Return: - The id. 
- """ - return secrets.token_hex(12) - - @staticmethod - def _issue_runner_installed_metric( - name: str, - flavor: str, - install_start_timestamp: float, - install_end_timestamp: float, - ) -> None: - """Issue metric for runner installed event. - - Args: - name: The name of the runner. - flavor: The flavor of the runner. - install_start_timestamp: The timestamp of installation start. - install_end_timestamp: The timestamp of installation end. - """ - try: - metric_events.issue_event( - event=metric_events.RunnerInstalled( - timestamp=install_start_timestamp, - flavor=flavor, - duration=install_end_timestamp - install_start_timestamp, - ) - ) - except IssueMetricEventError: - logger.exception("Failed to issue RunnerInstalled metric") - - try: - storage = metrics_storage.create(name) - except CreateMetricsStorageError: - logger.exception( - "Failed to create metrics storage for runner %s, " - "will not be able to issue all metrics.", - name, - ) - else: - try: - (storage.path / runner_metrics.RUNNER_INSTALLED_TS_FILE_NAME).write_text( - str(install_end_timestamp), encoding="utf-8" - ) - except FileNotFoundError: - logger.exception( - "Failed to write runner-installed.timestamp into metrics storage " - "for runner %s, will not be able to issue all metrics.", - name, - ) - - @staticmethod - def _pull_runner_metrics(name: str, ssh_conn: SSHConnection) -> None: - """Pull metrics from runner. - - Args: - name: The name of the runner. - ssh_conn: The SSH connection to the runner. 
- """ - try: - storage = metrics_storage.get(name) - except GetMetricsStorageError: - logger.exception( - "Failed to get shared metrics storage for runner %s, " - "will not be able to issue all metrics.", - name, - ) - return - - try: - OpenStackRunnerManager._ssh_pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local_path=str(storage.path / "pre-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - OpenStackRunnerManager._ssh_pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local_path=str(storage.path / "post-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - except _PullFileError as exc: - logger.warning( - "Failed to pull metrics for %s: %s . Will not be able to issue all metrics", - name, - exc, - ) - - @staticmethod - def _ssh_pull_file( - ssh_conn: SSHConnection, remote_path: str, local_path: str, max_size: int - ) -> None: - """Pull file from the runner instance. - - Args: - ssh_conn: The SSH connection instance. - remote_path: The file path on the runner instance. - local_path: The local path to store the file. - max_size: If the file is larger than this, it will not be pulled. - - Raises: - _PullFileError: Unable to pull the file from the runner instance. - SSHError: Issue with SSH connection. 
- """ - try: - result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise SSHError(f"Unable to SSH into {ssh_conn.host}") from exc - if not result.ok: - logger.warning( - ( - "Unable to get file size of %s on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - remote_path, - ssh_conn.host, - result.return_code, - result.stdout, - result.stderr, - ) - raise _PullFileError(f"Unable to get file size of {remote_path}") - - stdout = result.stdout - try: - stdout.strip() - size = int(stdout) - if size > max_size: - raise _PullFileError(f"File size of {remote_path} too large {size} > {max_size}") - except ValueError as exc: - raise _PullFileError(f"Invalid file size for {remote_path}: stdout") from exc - - try: - ssh_conn.get(remote=remote_path, local=local_path) - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise SSHError(f"Unable to SSH into {ssh_conn.host}") from exc - except OSError as exc: - raise _PullFileError(f"Unable to retrieve file {remote_path}") from exc - - @staticmethod - def _run_runner_removal_script( - instance_name: str, ssh_conn: SSHConnection, remove_token: str - ) -> None: - """Run Github runner removal script. - - Args: - instance_name: The name of the runner instance. - ssh_conn: The SSH connection to the runner instance. - remove_token: The GitHub instance removal token. - - Raises: - _GithubRunnerRemoveError: Unable to remove runner from GitHub. 
- """ - try: - result = ssh_conn.run( - f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", - warn=True, - ) - if result.ok: - return - - logger.warning( - ( - "Unable to run removal script on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - instance_name, - result.return_code, - result.stdout, - result.stderr, - ) - raise _GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise _GithubRunnerRemoveError( - f"Failed to remove runner {instance_name} from Github." - ) from exc diff --git a/src/reactive/__init__.py b/src/reactive/__init__.py deleted file mode 100644 index 1c7b82dda..000000000 --- a/src/reactive/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Package for code implementing reactive scheduling.""" diff --git a/src/reactive/consumer.py b/src/reactive/consumer.py deleted file mode 100644 index f868feddd..000000000 --- a/src/reactive/consumer.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Module responsible for consuming jobs from the message queue.""" -import contextlib -import logging -import signal -import sys -from contextlib import closing -from types import FrameType -from typing import Generator, cast - -from kombu import Connection -from kombu.simple import SimpleQueue -from pydantic import BaseModel, HttpUrl, ValidationError - -logger = logging.getLogger(__name__) - - -class JobDetails(BaseModel): - """A class to translate the payload. - - Attributes: - labels: The labels of the job. - run_url: The URL of the job. 
- """ - - labels: list[str] - run_url: HttpUrl - - -class JobError(Exception): - """Raised when a job error occurs.""" - - -def consume(mongodb_uri: str, queue_name: str) -> None: - """Consume a job from the message queue. - - Log the job details and acknowledge the message. - If the job details are invalid, reject the message and raise an error. - - Args: - mongodb_uri: The URI of the MongoDB database. - queue_name: The name of the queue. - - Raises: - JobError: If the job details are invalid. - """ - with Connection(mongodb_uri) as conn: - with closing(SimpleQueue(conn, queue_name)) as simple_queue: - with signal_handler(signal.SIGTERM): - msg = simple_queue.get(block=True) - try: - job_details = cast(JobDetails, JobDetails.parse_raw(msg.payload)) - except ValidationError as exc: - msg.reject(requeue=True) - raise JobError(f"Invalid job details: {msg.payload}") from exc - logger.info( - "Received job with labels %s and run_url %s", - job_details.labels, - job_details.run_url, - ) - msg.ack() - - -@contextlib.contextmanager -def signal_handler(signal_code: signal.Signals) -> Generator[None, None, None]: - """Set a signal handler and after the context, restore the default handler. - - The signal handler exits the process. - - Args: - signal_code: The signal code to handle. - """ - _set_signal_handler(signal_code) - try: - yield - finally: - _restore_signal_handler(signal_code) - - -def _set_signal_handler(signal_code: signal.Signals) -> None: - """Set a signal handler which exits the process. - - Args: - signal_code: The signal code to handle. - """ - - def sigterm_handler(signal_code: int, _: FrameType | None) -> None: - """Handle a signal. - - Call sys.exit with the signal code. Kombu should automatically - requeue unacknowledged messages. - - Args: - signal_code: The signal code to handle. - """ - print( - f"Signal '{signal.strsignal(signal_code)}' received. 
Will terminate.", file=sys.stderr - ) - sys.exit(signal_code) - - signal.signal(signal_code, sigterm_handler) - - -def _restore_signal_handler(signal_code: signal.Signals) -> None: - """Restore the default signal handler. - - Args: - signal_code: The signal code to restore. - """ - signal.signal(signal_code, signal.SIG_DFL) diff --git a/src/reactive/runner_manager.py b/src/reactive/runner_manager.py deleted file mode 100644 index 5799731ee..000000000 --- a/src/reactive/runner_manager.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Module for managing reactive runners.""" -import logging -import os -import shutil -import signal - -# All commands run by subprocess are secure. -import subprocess # nosec -from pathlib import Path - -from utilities import secure_run_subprocess - -logger = logging.getLogger(__name__) - -MQ_URI_ENV_VAR = "MQ_URI" -QUEUE_NAME_ENV_VAR = "QUEUE_NAME" -REACTIVE_RUNNER_LOG_DIR = Path("/var/log/reactive_runner") -REACTIVE_RUNNER_SCRIPT_FILE = "scripts/reactive_runner.py" -PYTHON_BIN = "/usr/bin/python3" -REACTIVE_RUNNER_CMD_LINE_PREFIX = f"{PYTHON_BIN} {REACTIVE_RUNNER_SCRIPT_FILE}" -PID_CMD_COLUMN_WIDTH = len(REACTIVE_RUNNER_CMD_LINE_PREFIX) -PIDS_COMMAND_LINE = [ - "ps", - "axo", - f"cmd:{PID_CMD_COLUMN_WIDTH},pid", - "--no-headers", - "--sort=-start_time", -] -UBUNTU_USER = "ubuntu" - - -class ReactiveRunnerError(Exception): - """Raised when a reactive runner error occurs.""" - - -def reconcile(quantity: int, mq_uri: str, queue_name: str) -> int: - """Spawn a runner reactively. - - Args: - quantity: The number of runners to spawn. - mq_uri: The message queue URI. - queue_name: The name of the queue. - - Raises a ReactiveRunnerError if the runner fails to spawn. - - Returns: - The number of reactive runner processes spawned. 
- """ - pids = _get_pids() - current_quantity = len(pids) - logger.info("Current quantity of reactive runner processes: %s", current_quantity) - delta = quantity - current_quantity - if delta > 0: - logger.info("Will spawn %d new reactive runner process(es)", delta) - _setup_logging_for_processes() - for _ in range(delta): - _spawn_runner(mq_uri=mq_uri, queue_name=queue_name) - elif delta < 0: - logger.info("Will kill %d process(es).", -delta) - for pid in pids[:-delta]: - logger.info("Killing reactive runner process with pid %s", pid) - try: - os.kill(pid, signal.SIGTERM) - except ProcessLookupError: - # There can be a race condition that the process has already terminated. - # We just ignore and log the fact. - logger.info( - "Failed to kill process with pid %s. Process might have terminated it self.", - pid, - ) - else: - logger.info("No changes to number of reactive runner processes needed.") - - return delta - - -def _get_pids() -> list[int]: - """Get the PIDs of the reactive runners processes. - - Returns: - The PIDs of the reactive runner processes sorted by start time in descending order. - - Raises: - ReactiveRunnerError: If the command to get the PIDs fails - """ - result = secure_run_subprocess(cmd=PIDS_COMMAND_LINE) - if result.returncode != 0: - raise ReactiveRunnerError("Failed to get list of processes") - - return [ - int(line.rstrip().rsplit(maxsplit=1)[-1]) - for line in result.stdout.decode().split("\n") - if line.startswith(REACTIVE_RUNNER_CMD_LINE_PREFIX) - ] - - -def _setup_logging_for_processes() -> None: - """Set up the log dir.""" - if not REACTIVE_RUNNER_LOG_DIR.exists(): - REACTIVE_RUNNER_LOG_DIR.mkdir() - shutil.chown(REACTIVE_RUNNER_LOG_DIR, user=UBUNTU_USER, group=UBUNTU_USER) - - -def _spawn_runner(mq_uri: str, queue_name: str) -> None: - """Spawn a runner. - - Args: - mq_uri: The message queue URI. - queue_name: The name of the queue. 
- """ - env = { - "PYTHONPATH": "src:lib:venv", - MQ_URI_ENV_VAR: mq_uri, - QUEUE_NAME_ENV_VAR: queue_name, - } - # We do not want to wait for the process to finish, so we do not use with statement. - # We trust the command. - command = " ".join( - [ - PYTHON_BIN, - REACTIVE_RUNNER_SCRIPT_FILE, - ">>", - # $$ will be replaced by the PID of the process, so we can track the error log easily. - f"{REACTIVE_RUNNER_LOG_DIR}/$$.log", - "2>&1", - ] - ) - logger.debug("Spawning a new reactive runner process with command: %s", command) - process = subprocess.Popen( # pylint: disable=consider-using-with # nosec - command, - shell=True, - env=env, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - user=UBUNTU_USER, - ) - - logger.info("Spawned a new reactive runner process with pid %s", process.pid) diff --git a/src/repo_policy_compliance_client.py b/src/repo_policy_compliance_client.py deleted file mode 100644 index 6dbc1d919..000000000 --- a/src/repo_policy_compliance_client.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Client for requesting repo policy compliance service.""" - -import logging -from urllib.parse import urljoin - -import requests -import urllib3 - -logger = logging.getLogger(__name__) - - -# Disable pylint public method number check as this class can be extended in the future. -class RepoPolicyComplianceClient: # pylint: disable=too-few-public-methods - """Client for repo policy compliance service. - - Attributes: - base_url: Base url to the repo policy compliance service. - token: Charm token configured for the repo policy compliance service. - """ - - def __init__(self, url: str, charm_token: str) -> None: - """Construct the RepoPolicyComplianceClient. - - Args: - url: Base URL to the repo policy compliance service. - charm_token: Charm token configured for the repo policy compliance service. 
- """ - self._session = self._create_session() - self.base_url = url - self.token = charm_token - - def get_one_time_token(self) -> str: - """Get a single-use token for repo policy compliance check. - - Raises: - HTTPError: If there was an error getting one-time token from repo-policy-compliance \ - service. - - Returns: - The one-time token to be used in a single request of repo policy compliance check. - """ - url = urljoin(self.base_url, "one-time-token") - try: - response = self._session.get(url, headers={"Authorization": f"Bearer {self.token}"}) - response.raise_for_status() - return response.content.decode("utf-8") - except requests.HTTPError: - logger.exception("Unable to get one time token from repo policy compliance service.") - raise - - def _create_session(self) -> requests.Session: - """Create a new requests session. - - Returns: - A new requests session with retries and no proxy settings. - """ - # The repo policy compliance service might be on localhost and should not have any proxies - # setting configured. This can be changed in the future when we also rely on an - # external service for LXD cloud. 
- adapter = requests.adapters.HTTPAdapter( - max_retries=urllib3.Retry( - total=3, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504] - ) - ) - - session = requests.Session() - session.mount("http://", adapter) - session.mount("https://", adapter) - session.trust_env = False - return session diff --git a/src/runner.py b/src/runner.py index 61a12115c..a1b64fcfc 100644 --- a/src/runner.py +++ b/src/runner.py @@ -21,9 +21,12 @@ from typing import Iterable, NamedTuple, Optional, Sequence import yaml +from github_runner_manager.metrics.runner_logs import SYSLOG_PATH, create_logs_dir +from github_runner_manager.metrics.storage import MetricsStorage +from github_runner_manager.types_.github import GitHubOrg import shared_fs -from charm_state import Arch, GitHubOrg, SSHDebugConnection, VirtualMachineResources +from charm_state import Arch, SSHDebugConnection, VirtualMachineResources from errors import ( CreateMetricsStorageError, GithubClientError, @@ -38,8 +41,6 @@ ) from lxd import LxdInstance from lxd_type import LxdInstanceConfig -from metrics.runner_logs import SYSLOG_PATH, create_logs_dir -from metrics.storage import MetricsStorage from runner_manager_type import RunnerManagerClients from runner_type import RunnerConfig, RunnerStatus from utilities import execute_command, retry diff --git a/src/runner_manager.py b/src/runner_manager.py index 31c30ef85..66a7e03d3 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -13,12 +13,19 @@ from pathlib import Path from typing import Iterator, Optional, Type +import github_runner_manager.reactive.runner_manager as reactive_runner_manager import jinja2 import requests import requests.adapters import urllib3 +from github_runner_manager.metrics import events as metric_events +from github_runner_manager.metrics import github as github_metrics +from github_runner_manager.metrics import runner as runner_metrics +from github_runner_manager.metrics import runner_logs +from github_runner_manager.metrics.runner 
import RUNNER_INSTALLED_TS_FILE_NAME +from github_runner_manager.repo_policy_compliance_client import RepoPolicyComplianceClient +from github_runner_manager.types_.github import RunnerApplication, SelfHostedRunner -import reactive.runner_manager as reactive_runner_manager import shared_fs from charm_state import VirtualMachineResources from errors import ( @@ -32,14 +39,7 @@ SubprocessError, ) from github_client import GithubClient -from github_type import RunnerApplication, SelfHostedRunner from lxd import LxdClient, LxdInstance -from metrics import events as metric_events -from metrics import github as github_metrics -from metrics import runner as runner_metrics -from metrics import runner_logs -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from repo_policy_compliance_client import RepoPolicyComplianceClient from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus from runner_manager_type import ( LXDFlushMode, diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py index 95f8edcc3..deb30540b 100644 --- a/src/runner_manager_type.py +++ b/src/runner_manager_type.py @@ -9,12 +9,12 @@ from typing import Iterable import jinja2 +from github_runner_manager.repo_policy_compliance_client import RepoPolicyComplianceClient +from github_runner_manager.types_.github import GitHubPath, GitHubRunnerStatus -from charm_state import CharmState, GitHubPath, ReactiveConfig +from charm_state import CharmState, ReactiveConfig from github_client import GithubClient -from github_type import GitHubRunnerStatus from lxd import LxdClient -from repo_policy_compliance_client import RepoPolicyComplianceClient class LXDFlushMode(Enum): diff --git a/src/runner_type.py b/src/runner_type.py index 92560cbcf..eec8793ee 100644 --- a/src/runner_type.py +++ b/src/runner_type.py @@ -8,7 +8,9 @@ from pathlib import Path from typing import Optional -from charm_state import GitHubPath, SSHDebugConnection +from github_runner_manager.types_.github 
import GitHubPath + +from charm_state import SSHDebugConnection @dataclass diff --git a/src/shared_fs.py b/src/shared_fs.py index 28e97c4fb..48c392113 100644 --- a/src/shared_fs.py +++ b/src/shared_fs.py @@ -7,7 +7,8 @@ from pathlib import Path from typing import Iterator -import metrics.storage as metrics_storage +import github_runner_manager.metrics.storage as metrics_storage + from errors import ( CreateMetricsStorageError, DeleteMetricsStorageError, diff --git a/src/utilities.py b/src/utilities.py index a19effc5c..86c32c4d2 100644 --- a/src/utilities.py +++ b/src/utilities.py @@ -3,14 +3,18 @@ """Utilities used by the charm.""" -import functools import logging import os import pathlib import subprocess # nosec B404 -import time -from typing import Any, Callable, Optional, Sequence, Type, TypeVar - +from typing import Any, Optional, Sequence, TypeVar + +# we import the functions from the utilities module, these are used in the charm +from github_runner_manager.utilities import retry # noqa: F401 pylint: disable=unused-import +from github_runner_manager.utilities import ( # noqa: F401 pylint: disable=unused-import + secure_run_subprocess, + set_env_var, +) from typing_extensions import ParamSpec from errors import SubprocessError @@ -24,130 +28,6 @@ ReturnT = TypeVar("ReturnT") -# This decorator has default arguments, one extra argument is not a problem. -def retry( # pylint: disable=too-many-arguments - exception: Type[Exception] = Exception, - tries: int = 1, - delay: float = 0, - max_delay: Optional[float] = None, - backoff: float = 1, - local_logger: logging.Logger = logger, -) -> Callable[[Callable[ParamT, ReturnT]], Callable[ParamT, ReturnT]]: - """Parameterize the decorator for adding retry to functions. - - Args: - exception: Exception type to be retried. - tries: Number of attempts at retry. - delay: Time in seconds to wait between retry. - max_delay: Max time in seconds to wait between retry. - backoff: Factor to increase the delay by each retry. 
- local_logger: Logger for logging. - - Returns: - The function decorator for retry. - """ - - def retry_decorator( - func: Callable[ParamT, ReturnT], - ) -> Callable[ParamT, ReturnT]: - """Decorate function with retry. - - Args: - func: The function to decorate. - - Returns: - The resulting function with retry added. - """ - - @functools.wraps(func) - def fn_with_retry(*args: ParamT.args, **kwargs: ParamT.kwargs) -> ReturnT: - """Wrap the function with retries. - - Args: - args: The placeholder for decorated function's positional arguments. - kwargs: The placeholder for decorated function's key word arguments. - - Raises: - RuntimeError: Should be unreachable. - - Returns: - Original return type of the decorated function. - """ - remain_tries, current_delay = tries, delay - - for _ in range(tries): - try: - return func(*args, **kwargs) - # Error caught is set by the input of the function. - except exception as err: # pylint: disable=broad-exception-caught - remain_tries -= 1 - - if remain_tries == 0: - if local_logger is not None: - local_logger.exception("Retry limit of %s exceed: %s", tries, err) - raise - - if local_logger is not None: - local_logger.warning( - "Retrying error in %s seconds: %s", current_delay, err - ) - local_logger.debug("Error to be retried:", stack_info=True) - - time.sleep(current_delay) - - current_delay *= backoff - - if max_delay is not None: - current_delay = min(current_delay, max_delay) - - raise RuntimeError("Unreachable code of retry logic.") - - return fn_with_retry - - return retry_decorator - - -def secure_run_subprocess( - cmd: Sequence[str], hide_cmd: bool = False, **kwargs: dict[str, Any] -) -> subprocess.CompletedProcess[bytes]: - """Run command in subprocess according to security recommendations. - - CalledProcessError will not be raised on error of the command executed. - Errors should be handled by the caller by checking the exit code. 
- - The command is executed with `subprocess.run`, additional arguments can be passed to it as - keyword arguments. The following arguments to `subprocess.run` should not be set: - `capture_output`, `shell`, `check`. As those arguments are used by this function. - - Args: - cmd: Command in a list. - hide_cmd: Hide logging of cmd. - kwargs: Additional keyword arguments for the `subprocess.run` call. - - Returns: - Object representing the completed process. The outputs subprocess can accessed. - """ - if not hide_cmd: - logger.info("Executing command %s", cmd) - else: - logger.info("Executing sensitive command") - - result = subprocess.run( # nosec B603 - cmd, - capture_output=True, - # Not running in shell to avoid security problems. - shell=False, - check=False, - # Disable type check due to the support for unpacking arguments in mypy is experimental. - **kwargs, # type: ignore - ) - if not hide_cmd: - logger.debug("Command %s returns: %s", cmd, result.stdout) - else: - logger.debug("Command returns: %s", result.stdout) - return result - - def execute_command(cmd: Sequence[str], check_exit: bool = True, **kwargs: Any) -> tuple[str, int]: """Execute a command on a subprocess. @@ -203,19 +83,6 @@ def get_env_var(env_var: str) -> Optional[str]: return os.environ.get(env_var.upper(), os.environ.get(env_var.lower(), None)) -def set_env_var(env_var: str, value: str) -> None: - """Set the environment variable value. - - Set the all upper case and all low case of the `env_var`. - - Args: - env_var: Name of the environment variable. - value: Value to set environment variable to. - """ - os.environ[env_var.upper()] = value - os.environ[env_var.lower()] = value - - def bytes_with_unit_to_kib(num_bytes: str) -> int: """Convert a positive integer followed by a unit to number of kibibytes. 
diff --git a/templates/openstack-userdata.sh.j2 b/templates/openstack-userdata.sh.j2 deleted file mode 100644 index 047a62be1..000000000 --- a/templates/openstack-userdata.sh.j2 +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/sh - -set -e - -hostnamectl set-hostname github-runner - -# Write .env contents -su - ubuntu -c 'cd ~/actions-runner && echo "{{ env_contents }}" > .env' - -{% if aproxy_address %} -snap install aproxy --edge -snap set aproxy proxy={{ aproxy_address }} listen=:54969 -cat << EOF > /etc/nftables.conf -define default-ip = $(ip route get $(ip route show 0.0.0.0/0 | grep -oP 'via \K\S+') | grep -oP 'src \K\S+') -define private-ips = { 10.0.0.0/8, 127.0.0.1/8, 172.16.0.0/12, 192.168.0.0/16 } -table ip aproxy -flush table ip aproxy -table ip aproxy { - chain prerouting { - type nat hook prerouting priority dstnat; policy accept; - ip daddr != \$private-ips tcp dport { 80, 443 } counter dnat to \$default-ip:54969 - } - - chain output { - type nat hook output priority -100; policy accept; - ip daddr != \$private-ips tcp dport { 80, 443 } counter dnat to \$default-ip:54969 - } -} -EOF -systemctl enable nftables.service -nft -f /etc/nftables.conf -{% endif %} - -adduser ubuntu lxd -adduser ubuntu adm - -{% if dockerhub_mirror %} -echo "{\"registry-mirrors\": [\"{{ dockerhub_mirror }}\"]}" > /etc/docker/daemon.json -sudo systemctl daemon-reload -sudo systemctl restart docker -{% endif %} - -# Prepare metrics -su - ubuntu -c 'mkdir "{{ metrics_exchange_path }}"' - -# Insert pre-job script -cat << 'EOF' | su - ubuntu -c 'tee /home/ubuntu/actions-runner/pre-job.sh' -{{ pre_job_contents | safe }} -EOF - -# Create the runner and start the configuration experience -{% if runner_group %} -su - ubuntu -c "cd ~/actions-runner && ./config.sh \ - --url {{ github_url }} \ - --runnergroup '{{ runner_group }}' \ - --token {{ token }} --ephemeral --unattended \ - --labels {{ instance_labels }} --name {{ instance_name }}" -{% else %} -su - ubuntu -c "cd ~/actions-runner && 
./config.sh \ - --url {{ github_url }} \ - --token {{ token }} --ephemeral --unattended \ - --labels {{ instance_labels }} --name {{ instance_name }}" -{% endif %} - - -write_post_metrics(){ - # Expects the exit code of the run.sh script as the first argument. - - # Only write the post-job metrics if the file does not already exist - which may indicate - # that the job has failed inside pre-job. - - if [ -f {{ metrics_exchange_path}}/post-job-metrics.json ]; then - return - fi - - timestamp=$(date +%s) - - # Write the post-job metrics using status abnormal and exit code if exit code is non-zero - if [ "$1" != "0" ]; then - sudo -g ubuntu -u ubuntu jq -n \ - --argjson timestamp "$timestamp" \ - --arg status "abnormal" \ - --argjson exit_code "$1" \ - '{ - "timestamp": $timestamp, - "status": $status, - "status_info": {code: $exit_code} - }' > "{{ metrics_exchange_path}}/post-job-metrics.json" - return - else - # If exit code is zero, write the post-job metrics using status normal - sudo -g ubuntu -u ubuntu jq -n \ - --argjson timestamp "$timestamp" \ - '{ - "timestamp": $timestamp, - "status": "normal" - }' > "{{ metrics_exchange_path }}/post-job-metrics.json" - fi -} - -# Run runner -# We want to capture the exit code of the run.sh script and write the post-job metrics. -(set +e; su - ubuntu -c "cd ~/actions-runner && /home/ubuntu/actions-runner/run.sh"; write_post_metrics $?) 
- -su - ubuntu -c "touch /home/ubuntu/run-completed" diff --git a/tests/integration/helpers/charm_metrics.py b/tests/integration/helpers/charm_metrics.py index b6c2f05bc..15cd7e3db 100644 --- a/tests/integration/helpers/charm_metrics.py +++ b/tests/integration/helpers/charm_metrics.py @@ -14,12 +14,12 @@ from github.Repository import Repository from github.Workflow import Workflow from github.WorkflowJob import WorkflowJob +from github_runner_manager.metrics.events import METRICS_LOG_PATH +from github_runner_manager.metrics.runner import PostJobStatus +from github_runner_manager.types_.github import JobConclusion from juju.application import Application from juju.unit import Unit -from github_type import JobConclusion -from metrics.events import METRICS_LOG_PATH -from metrics.runner import PostJobStatus from tests.integration.helpers.common import ( InstanceHelper, get_file_content, diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index c15afd5a5..a77c14604 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -6,12 +6,12 @@ from typing import Optional, TypedDict, cast import openstack.connection +from github_runner_manager.openstack_cloud.openstack_cloud import OpenstackCloud from juju.application import Application from juju.unit import Unit from openstack.compute.v2.server import Server from charm_state import VIRTUAL_MACHINES_CONFIG_NAME -from openstack_cloud.openstack_cloud import OpenstackCloud from tests.integration.helpers.common import InstanceHelper, reconcile, run_in_unit, wait_for logger = logging.getLogger(__name__) diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index e3de1600d..6ce23fa0d 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -10,12 +10,12 @@ import pytest_asyncio from github.Branch import Branch from github.Repository 
import Repository +from github_runner_manager.metrics import runner_logs +from github_runner_manager.metrics.runner import PostJobStatus from juju.application import Application from juju.model import Model from charm_state import PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME -from metrics import runner_logs -from metrics.runner import PostJobStatus from tests.integration.helpers.charm_metrics import ( assert_events_after_reconciliation, cancel_workflow_run, diff --git a/tests/integration/test_charm_metrics_success.py b/tests/integration/test_charm_metrics_success.py index c9b7a8dc0..5e8254e5d 100644 --- a/tests/integration/test_charm_metrics_success.py +++ b/tests/integration/test_charm_metrics_success.py @@ -10,11 +10,11 @@ import pytest_asyncio from github.Branch import Branch from github.Repository import Repository +from github_runner_manager.metrics.runner import PostJobStatus from juju.application import Application from juju.model import Model from charm_state import PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME -from metrics.runner import PostJobStatus from tests.integration.helpers.charm_metrics import ( assert_events_after_reconciliation, clear_metrics_log, diff --git a/tests/integration/test_reactive.py b/tests/integration/test_reactive.py index b7445be1f..06dc6e48c 100644 --- a/tests/integration/test_reactive.py +++ b/tests/integration/test_reactive.py @@ -6,14 +6,14 @@ import secrets import pytest +from github_runner_manager.reactive.consumer import JobDetails +from github_runner_manager.reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from juju.application import Application from juju.model import Model from juju.unit import Unit from kombu import Connection from pytest_operator.plugin import OpsTest -from reactive.consumer import JobDetails -from reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from tests.integration.helpers.common import get_file_content, reconcile, run_in_unit FAKE_URL = "http://example.com" diff --git 
a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 63b7204b3..cb88d84ba 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -1,5 +1,5 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. """Testing the RunnerManager class with OpenStackRunnerManager as CloudManager.""" @@ -15,19 +15,28 @@ from github.Branch import Branch from github.Repository import Repository from github.Workflow import Workflow -from openstack.connection import Connection as OpenstackConnection - -from charm_state import GitHubPath, ProxyConfig, parse_github_path -from manager.cloud_runner_manager import CloudRunnerState, GitHubRunnerConfig, SupportServiceConfig -from manager.github_runner_manager import GitHubRunnerState -from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig -from metrics import events, storage -from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH -from openstack_cloud.openstack_runner_manager import ( +from github_runner_manager.manager.cloud_runner_manager import ( + CloudRunnerState, + GitHubRunnerConfig, + SupportServiceConfig, +) +from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.manager.runner_manager import ( + FlushMode, + RunnerManager, + RunnerManagerConfig, +) +from github_runner_manager.metrics import events, storage +from github_runner_manager.openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH +from github_runner_manager.openstack_cloud.openstack_runner_manager import ( OpenStackCloudConfig, OpenStackRunnerManager, OpenStackServerConfig, ) +from github_runner_manager.types_.github import GitHubPath, parse_github_path +from openstack.connection import Connection as OpenstackConnection + +from charm_state import ProxyConfig from 
tests.integration.helpers.common import ( DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, dispatch_workflow, diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py index 4232fae4b..46c8280b1 100644 --- a/tests/integration/test_self_hosted_runner.py +++ b/tests/integration/test_self_hosted_runner.py @@ -9,6 +9,7 @@ import github import pytest from github.Repository import Repository +from github_runner_manager.types_.github import GitHubRepo from juju.application import Application from juju.model import Model @@ -16,7 +17,6 @@ DOCKERHUB_MIRROR_CONFIG_NAME, PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME, - GitHubRepo, ) from github_client import GithubClient from tests.integration.helpers.common import ( diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index cb50275f6..c0b760144 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -8,9 +8,9 @@ from pathlib import Path import pytest +from github_runner_manager.manager.runner_scaler import RunnerScaler import utilities -from manager.runner_scaler import RunnerScaler from tests.unit.mock import MockGhapiClient, MockLxdClient, MockRepoPolicyComplianceClient @@ -76,9 +76,11 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr("firewall.Firewall.refresh_firewall", unittest.mock.MagicMock()) monkeypatch.setattr("runner.execute_command", lxd_exec_command) monkeypatch.setattr("runner.shared_fs", unittest.mock.MagicMock()) - monkeypatch.setattr("metrics.events.METRICS_LOG_PATH", Path(tmp_path / "metrics.log")) + monkeypatch.setattr( + "github_runner_manager.metrics.events.METRICS_LOG_PATH", Path(tmp_path / "metrics.log") + ) monkeypatch.setattr("runner.time", unittest.mock.MagicMock()) - monkeypatch.setattr("github_client.GhApi", MockGhapiClient) + monkeypatch.setattr("github_runner_manager.github_client.GhApi", MockGhapiClient) monkeypatch.setattr("runner_manager_type.jinja2", unittest.mock.MagicMock()) 
monkeypatch.setattr("runner_manager_type.LxdClient", MockLxdClient) monkeypatch.setattr("runner_manager.github_metrics", unittest.mock.MagicMock()) @@ -91,7 +93,7 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr( "runner_manager.RepoPolicyComplianceClient", MockRepoPolicyComplianceClient ) - monkeypatch.setattr("utilities.time", unittest.mock.MagicMock()) + monkeypatch.setattr("github_runner_manager.utilities.time", unittest.mock.MagicMock()) @pytest.fixture(autouse=True, name="cloud_name") @@ -108,7 +110,7 @@ def clouds_yaml_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path: Path: Mocked clouds.yaml path. """ clouds_yaml_path = tmp_path / "clouds.yaml" - monkeypatch.setattr("openstack_cloud.CLOUDS_YAML_PATH", clouds_yaml_path) + monkeypatch.setattr("github_runner_manager.openstack_cloud.CLOUDS_YAML_PATH", clouds_yaml_path) return clouds_yaml_path diff --git a/tests/unit/metrics/__init__.py b/tests/unit/metrics/__init__.py deleted file mode 100644 index 188515554..000000000 --- a/tests/unit/metrics/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. diff --git a/tests/unit/metrics/test_events.py b/tests/unit/metrics/test_events.py deleted file mode 100644 index 195768291..000000000 --- a/tests/unit/metrics/test_events.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import json -from pathlib import Path - -from metrics import events - -TEST_LOKI_PUSH_API_URL = "http://loki:3100/api/prom/push" - - -def test_issue_events_logs_events(tmp_path: Path): - """ - arrange: Change path of the events log. - act: Issue a metric event. - assert: The expected metric log is created. 
- """ - event = events.RunnerInstalled(timestamp=123, flavor="small", duration=456) - - events.issue_event(event) - - assert json.loads(events.METRICS_LOG_PATH.read_text()) == { - "event": "runner_installed", - "timestamp": 123, - "flavor": "small", - "duration": 456, - } - - -def test_issue_events_exclude_none_values(tmp_path: Path): - """ - arrange: Change path of the events log. - act: Issue a metric event with a None value. - assert: The expected metric log without the None value is created. - """ - event = events.RunnerStop( - timestamp=123, - flavor="small", - workflow="workflow", - repo="repo", - github_event="github_event", - status="status", - status_info=None, - job_duration=456, - ) - - events.issue_event(event) - - assert json.loads(events.METRICS_LOG_PATH.read_text()) == { - "event": "runner_stop", - "timestamp": 123, - "flavor": "small", - "workflow": "workflow", - "repo": "repo", - "github_event": "github_event", - "status": "status", - "job_duration": 456, - } diff --git a/tests/unit/metrics/test_github.py b/tests/unit/metrics/test_github.py deleted file mode 100644 index 78a21e4e1..000000000 --- a/tests/unit/metrics/test_github.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
-import secrets -from datetime import datetime, timedelta, timezone -from random import randint -from unittest.mock import MagicMock - -import pytest - -from errors import GithubMetricsError, JobNotFoundError -from github_client import GithubClient -from github_type import JobConclusion, JobStats -from metrics import github as github_metrics -from metrics.runner import PreJobMetrics - - -@pytest.fixture(name="pre_job_metrics") -def pre_job_metrics_fixture() -> PreJobMetrics: - """Create a PreJobMetrics object.""" - return PreJobMetrics( - repository="owner/repo", - workflow_run_id=1, - workflow="workflow", - job_name="job", - job_started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - timestamp=1234567890, - event="push", - ) - - -def test_job(pre_job_metrics: PreJobMetrics): - """ - arrange: create a GithubClient mock which returns a GithubJobStats object. - act: Call job. - assert: the job metrics are returned. - """ - github_client = MagicMock(spec=GithubClient) - runner_name = secrets.token_hex(16) - created_at = datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc) - started_at = created_at + timedelta(seconds=3600) - github_client.get_job_info.return_value = JobStats( - created_at=created_at, - started_at=started_at, - runner_name=runner_name, - conclusion=JobConclusion.SUCCESS, - job_id=randint(1, 1000), - ) - - job_metrics = github_metrics.job( - github_client=github_client, pre_job_metrics=pre_job_metrics, runner_name=runner_name - ) - - assert job_metrics.queue_duration == 3600 - assert job_metrics.conclusion == JobConclusion.SUCCESS - - -def test_job_job_not_found(pre_job_metrics: PreJobMetrics): - """ - arrange: create a GithubClient mock which raises a JobNotFound exception. - act: Call job. - assert: a GithubMetricsError is raised. 
- """ - github_client = MagicMock(spec=GithubClient) - runner_name = secrets.token_hex(16) - github_client.get_job_info.side_effect = JobNotFoundError("Job not found") - - with pytest.raises(GithubMetricsError): - github_metrics.job( - github_client=github_client, pre_job_metrics=pre_job_metrics, runner_name=runner_name - ) diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py deleted file mode 100644 index bf0a14251..000000000 --- a/tests/unit/metrics/test_runner.py +++ /dev/null @@ -1,649 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import json -import secrets -from pathlib import Path -from unittest.mock import MagicMock, call - -import pytest - -from errors import DeleteMetricsStorageError, IssueMetricEventError -from github_type import JobConclusion -from metrics import events as metric_events -from metrics import runner as runner_metrics -from metrics import type as metrics_type -from metrics.events import RunnerStart, RunnerStop -from metrics.runner import ( - RUNNER_INSTALLED_TS_FILE_NAME, - PostJobMetrics, - PreJobMetrics, - RunnerMetrics, -) -from metrics.storage import MetricsStorage - - -@pytest.fixture(autouse=True, name="issue_event_mock") -def issue_event_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock the issue_event function.""" - issue_event_mock = MagicMock() - monkeypatch.setattr("metrics.events.issue_event", issue_event_mock) - return issue_event_mock - - -@pytest.fixture(name="runner_fs_base") -def runner_fs_base_fixture(tmp_path: Path) -> Path: - """Create a runner filesystem base.""" - runner_fs_base = tmp_path / "runner-fs" - runner_fs_base.mkdir(exist_ok=True) - return runner_fs_base - - -def _create_metrics_data(runner_name: str) -> RunnerMetrics: - """Create a RunnerMetrics object that is suitable for most tests. - - Args: - runner_name: The test runner name. - - Returns: - Test metrics data. 
- """ - return RunnerMetrics( - installed_timestamp=1, - pre_job=PreJobMetrics( - timestamp=1, - workflow="workflow1", - workflow_run_id="workflow_run_id1", - repository="org1/repository1", - event="push", - ), - post_job=PostJobMetrics(timestamp=3, status=runner_metrics.PostJobStatus.NORMAL), - runner_name=runner_name, - ) - - -def _create_runner_fs_base(tmp_path: Path): - """Create a runner filesystem base. - - Args: - tmp_path: The temporary path to create test runner filesystem under. - - Returns: - The runner filesystem temporary path. - """ - runner_fs_base = tmp_path / "runner-fs" - runner_fs_base.mkdir(exist_ok=True) - return runner_fs_base - - -def _create_runner_files( - runner_fs_base: Path, - runner_name: str, - pre_job_data: str | bytes | None, - post_job_data: str | bytes | None, - installed_timestamp: str | bytes | None, -) -> MetricsStorage: - """Create runner files inside shared fs. - - If the data is bytes, the file is written as binary, otherwise as text. - If data is None, it is not written. - - Args: - runner_fs_base: The base path of the shared fs. - runner_name: The runner name. - pre_job_data: The pre-job metrics data. - post_job_data: The post-job metrics data. - installed_timestamp: The installed timestamp. - - Returns: - A SharedFilesystem instance. 
- """ - runner_fs = runner_fs_base / runner_name - runner_fs.mkdir() - if pre_job_data: - if isinstance(pre_job_data, bytes): - runner_fs.joinpath(runner_metrics.PRE_JOB_METRICS_FILE_NAME).write_bytes(pre_job_data) - else: - runner_fs.joinpath(runner_metrics.PRE_JOB_METRICS_FILE_NAME).write_text( - pre_job_data, encoding="utf-8" - ) - - if post_job_data: - if isinstance(post_job_data, bytes): - runner_fs.joinpath(runner_metrics.POST_JOB_METRICS_FILE_NAME).write_bytes( - post_job_data - ) - else: - runner_fs.joinpath(runner_metrics.POST_JOB_METRICS_FILE_NAME).write_text( - post_job_data, encoding="utf-8" - ) - - if installed_timestamp: - if isinstance(installed_timestamp, bytes): - runner_fs.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).write_bytes(installed_timestamp) - else: - runner_fs.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).write_text( - installed_timestamp, encoding="utf-8" - ) - return MetricsStorage(path=runner_fs, runner_name=runner_name) - - -def test_extract(runner_fs_base: Path): - """ - arrange: \ - 1. A runner with all metrics inside shared fs. \ - 2. A runner with only pre-job metrics inside shared fs. \ - 3. A runner with no metrics except installed_timestamp inside shared fs. - act: Call extract - assert: All shared filesystems are removed and for runners - 1. + 2. metrics are extracted - 3. no metrics are extracted - """ - runner_all_metrics_name = secrets.token_hex(16) - runner_all_metrics = _create_metrics_data(runner_all_metrics_name) - runner_wihout_post_job_name = secrets.token_hex(16) - runner_without_post_job_metrics = runner_all_metrics.copy() - runner_without_post_job_metrics.post_job = None - runner_without_post_job_metrics.runner_name = runner_wihout_post_job_name - - # 1. Runner has all metrics inside shared fs - runner1_fs = _create_runner_files( - runner_fs_base, - runner_all_metrics_name, - runner_all_metrics.pre_job.json(), - runner_all_metrics.post_job.json(), - str(runner_all_metrics.installed_timestamp), - ) - - # 2. 
Runner has only pre-job metrics inside shared fs - runner2_fs = _create_runner_files( - runner_fs_base, - runner_wihout_post_job_name, - runner_without_post_job_metrics.pre_job.json(), - None, - str(runner_without_post_job_metrics.installed_timestamp), - ) - - # 3. Runner has no metrics except installed_timestamp inside shared fs - runner3_fs = _create_runner_files(runner_fs_base, secrets.token_hex(16), None, None, "5") - - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner1_fs, runner2_fs, runner3_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert extracted_metrics == [ - runner_all_metrics, - runner_without_post_job_metrics, - ] - metrics_storage_manager.delete.assert_has_calls( - [ - ((runner1_fs.runner_name,),), - ((runner2_fs.runner_name,),), - ((runner3_fs.runner_name,),), - ] - ) - - -def test_extract_ignores_runners(runner_fs_base: Path): - """ - arrange: Runners with metrics. - act: Call extract with some runners on ignore list. - expect: The ignored runners are not processed. 
- """ - runner_metrics_data = [] - - runner_filesystems = [] - for i in range(5): - runner_name = secrets.token_hex(16) - data = _create_metrics_data(runner_name) - data.pre_job.workflow = f"workflow{i}" - runner_metrics_data.append(data) - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - data.pre_job.json(), - data.post_job.json(), - str(data.installed_timestamp), - ) - runner_filesystems.append(runner_fs) - - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = runner_filesystems - - ignore_runners = {runner_filesystems[0].runner_name, runner_filesystems[2].runner_name} - - extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, runners=ignore_runners - ) - ) - - assert extracted_metrics == runner_metrics_data[1:2] + runner_metrics_data[3:] - - -def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPatch): - """ - arrange: \ - 1. A runner with non-compliant pre-job metrics inside shared fs. \ - 2. A runner with non-json post-job metrics inside shared fs. \ - 3. A runner with json array post-job metrics inside shared fs. \ - 4. A runner with no real timestamp in installed_timestamp file inside shared fs. - act: Call extract. - assert: No metrics are extracted is issued and shared filesystems are quarantined in all cases. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - # 1. 
Runner has noncompliant pre-job metrics inside shared fs - invalid_pre_job_data = runner_metrics_data.pre_job.copy(update={"timestamp": -1}) - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - invalid_pre_job_data.json(), - runner_metrics_data.post_job.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner_fs] - move_to_quarantine_mock = MagicMock() - monkeypatch.setattr(runner_metrics, "move_to_quarantine", move_to_quarantine_mock) - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 2. Runner has non-json post-job metrics inside shared fs - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - b"\x00", - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 3. Runner has json post-job metrics but a json array (not object) inside shared fs. 
- runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - json.dumps([runner_metrics_data.post_job.dict()]), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 4. Runner has not a timestamp in installed_timestamp file inside shared fs - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - b"\x00", - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - -def test_extract_raises_error_for_too_large_files( - runner_fs_base: Path, issue_event_mock: MagicMock, monkeypatch: pytest.MonkeyPatch -): - """ - arrange: Runners with too large metric and timestamp files. - act: Call extract. - assert: No metrics are extracted and shared filesystems is quarantined. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - # 1. 
Runner has a pre-job metrics file that is too large - invalid_pre_job_data = runner_metrics_data.pre_job.copy( - update={"workflow": "a" * runner_metrics.FILE_SIZE_BYTES_LIMIT + "b"} - ) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - invalid_pre_job_data.json(), - runner_metrics_data.post_job.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager = MagicMock() - - metrics_storage_manager.list_all.return_value = [runner_fs] - - move_to_quarantine_mock = MagicMock() - monkeypatch.setattr(runner_metrics, "move_to_quarantine", move_to_quarantine_mock) - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 2. Runner has a post-job metrics file that is too large - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - invalid_post_job_data = runner_metrics_data.post_job.copy( - update={"status": "a" * runner_metrics.FILE_SIZE_BYTES_LIMIT + "b"} - ) - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - invalid_post_job_data.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert not extracted_metrics - - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 3. 
Runner has an installed_timestamp file that is too large - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - invalid_ts = "1" * (runner_metrics.FILE_SIZE_BYTES_LIMIT + 1) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - invalid_ts, - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - -def test_extract_ignores_filesystems_without_ts(runner_fs_base: Path): - """ - arrange: A runner without installed_timestamp file inside shared fs. - act: Call extract. - assert: No metrics are extracted and shared filesystem is removed. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = RunnerMetrics.construct( - installed_timestamp=1, - pre_job=PreJobMetrics( - timestamp=1, - workflow="workflow1", - workflow_run_id="workflow_run_id1", - repository="org1/repository1", - event="push", - ), - post_job=PostJobMetrics(timestamp=3, status=runner_metrics.PostJobStatus.NORMAL), - runner_name=runner_name, - ) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - None, - ) - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - metrics_storage_manager.delete.assert_called_once_with(runner_fs.runner_name) - - -def test_extract_ignores_failure_on_shared_fs_cleanup( - runner_fs_base: Path, - caplog: pytest.LogCaptureFixture, -): - """ - arrange: Mock the shared_fs.delete to raise an 
exception. - act: Call extract. - assert: The metric is extracted and the exception is caught and logged. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_fs = _create_runner_files( - runner_fs_base, - runner_metrics_data.runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager = MagicMock() - - metrics_storage_manager.list_all.return_value = [runner_fs] - - metrics_storage_manager.delete.side_effect = DeleteMetricsStorageError( - "Failed to delete shared filesystem" - ) - - extracted_metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, runners=set() - ) - assert list(extracted_metrics) == [runner_metrics_data] - - assert "Failed to delete shared filesystem" in caplog.text - - -def test_issue_events(issue_event_mock: MagicMock): - """ - arrange: A runner with all metrics. - act: Call issue_events. - assert: RunnerStart and RunnerStop metrics are issued. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert issued_metrics == {metric_events.RunnerStart, metric_events.RunnerStop} - issue_event_mock.assert_has_calls( - [ - # 1. 
Runner - call( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=runner_metrics_data.pre_job.timestamp - - runner_metrics_data.installed_timestamp, - queue_duration=job_metrics.queue_duration, - ) - ), - call( - RunnerStop( - timestamp=runner_metrics_data.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - status=runner_metrics_data.post_job.status, - job_duration=runner_metrics_data.post_job.timestamp - - runner_metrics_data.pre_job.timestamp, - job_conclusion=job_metrics.conclusion, - ) - ), - ] - ) - - -def test_issue_events_pre_job_before_runner_installed(issue_event_mock: MagicMock): - """ - arrange: A runner with pre-job timestamp smaller than installed timestamp. - act: Call issue_events. - assert: RunnerStart metric is issued with idle set to 0. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.pre_job.timestamp = 0 - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert metric_events.RunnerStart in issued_metrics - issue_event_mock.assert_has_calls( - [ - call( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=0, - queue_duration=job_metrics.queue_duration, - ) - ) - ] - ) - - -def test_issue_events_post_job_before_pre_job(issue_event_mock: MagicMock): - """ - arrange: A runner with post-job timestamp smaller than pre-job timestamps. - act: Call issue_events. - assert: job_duration is set to zero. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.post_job = PostJobMetrics( - timestamp=0, status=runner_metrics.PostJobStatus.NORMAL - ) - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - - assert metric_events.RunnerStop in issued_metrics - issue_event_mock.assert_has_calls( - [ - call( - RunnerStop( - timestamp=runner_metrics_data.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - status=runner_metrics_data.post_job.status, - job_duration=0, - job_conclusion=job_metrics.conclusion, - ) - ), - ] - ) - - -def test_issue_events_no_post_job_metrics(issue_event_mock: MagicMock): - """ - arrange: A runner without post-job metrics. - act: Call issue_events. - assert: Only RunnerStart metric is issued. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.post_job = None - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert issued_metrics == {metric_events.RunnerStart} - - issue_event_mock.assert_called_once_with( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=runner_metrics_data.pre_job.timestamp - runner_metrics_data.installed_timestamp, - queue_duration=job_metrics.queue_duration, - ) - ) - - -def test_issue_events_returns_empty_set_on_issue_event_failure( - issue_event_mock: MagicMock, - caplog: pytest.LogCaptureFixture, -): - """ - arrange: Mock the issue_event_mock to raise an exception on the first call. - act: Call issue_events. - assert: No metrics at all are issued. The exception is caught and logged. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - issue_event_mock.side_effect = [IssueMetricEventError("Failed to issue metric"), None] - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert not issued_metrics - assert "Failed to issue metric" in caplog.text diff --git a/tests/unit/metrics/test_runner_logs.py b/tests/unit/metrics/test_runner_logs.py deleted file mode 100644 index d53dc17cf..000000000 --- a/tests/unit/metrics/test_runner_logs.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -from pathlib import Path - -import pytest - -from metrics import runner_logs - - -@pytest.fixture(name="log_dir_base_path") -def log_dir_base_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: - """Mock the log directory path and return it.""" - log_dir_base_path = tmp_path / "log_dir" - monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", log_dir_base_path) - return log_dir_base_path - - -def test_remove_outdated_crashed(log_dir_base_path: Path, monkeypatch: pytest.MonkeyPatch): - """ - arrange: Mock the base log directory path. - act: Remove the logs of the runner. - assert: The expected logs are removed. - """ - monkeypatch.setattr(runner_logs, "OUTDATED_LOGS_IN_SECONDS", 0) - - log_dir_path = log_dir_base_path / "test-runner" - log_dir_path.mkdir(parents=True) - - runner_logs.remove_outdated() - - assert not log_dir_path.exists() diff --git a/tests/unit/metrics/test_storage.py b/tests/unit/metrics/test_storage.py deleted file mode 100644 index bc8d0e94c..000000000 --- a/tests/unit/metrics/test_storage.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
-import secrets -import tarfile -from pathlib import Path - -import pytest - -from errors import ( - CreateMetricsStorageError, - DeleteMetricsStorageError, - GetMetricsStorageError, - QuarantineMetricsStorageError, -) -from metrics import storage -from metrics.storage import MetricsStorage - - -@pytest.fixture(autouse=True, name="filesystem_paths") -def filesystem_paths_fixture(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> dict[str, Path]: - """Mock the hardcoded filesystem paths.""" - ms_path = tmp_path / "runner-fs" - ms_quarantine_path = tmp_path / "quarantine" - monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", ms_path) - monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", ms_quarantine_path) - return {"base": ms_path, "quarantine": ms_quarantine_path} - - -def test_create_creates_directory(): - """ - arrange: Given a runner name and a path for the storage. - act: Call create. - assert: The directory is created. - """ - runner_name = secrets.token_hex(16) - - fs = storage.create(runner_name) - - assert fs.path.exists() - assert fs.path.is_dir() - - -def test_create_raises_exception_if_already_exists(): - """ - arrange: Given a runner name and an already existing shared filesystem. - act: Call create. - assert: The expected exception is raised. - """ - runner_name = secrets.token_hex(16) - storage.create(runner_name) - - with pytest.raises(CreateMetricsStorageError): - storage.create(runner_name) - - -def test_list_all(): - """ - arrange: Create metric storages for multiple runners. - act: Call list_all. - assert: A generator listing all the shared filesystems is returned. - """ - runner_names = [secrets.token_hex(16) for _ in range(3)] - for runner_name in runner_names: - storage.create(runner_name) - - fs_list = list(storage.list_all()) - - assert len(fs_list) == 3 - for fs in fs_list: - assert isinstance(fs, storage.MetricsStorage) - assert fs.runner_name in runner_names - - -def test_list_all_empty(): - """ - arrange: Nothing. 
- act: Call list_all. - assert: An empty iterator is returned. - """ - fs_list = list(storage.list_all()) - - assert len(fs_list) == 0 - - -def test_delete(): - """ - arrange: Create metrics storage for a runner. - act: Call delete - assert: The storage is deleted. - """ - runner_name = secrets.token_hex(16) - storage.create(runner_name) - - storage.delete(runner_name) - - with pytest.raises(GetMetricsStorageError): - storage.get(runner_name) - - -def test_delete_raises_error(): - """ - arrange: Nothing. - act: Call delete. - assert: A DeleteMetricsStorageError is raised. - """ - runner_name = secrets.token_hex(16) - - with pytest.raises(DeleteMetricsStorageError): - storage.delete(runner_name) - - -def test_get(): - """ - arrange: Given a runner name. - act: Call create and get. - assert: A metrics storage object for this runner is returned. - """ - runner_name = secrets.token_hex(16) - - storage.create(runner_name) - ms = storage.get(runner_name) - - assert isinstance(ms, MetricsStorage) - assert ms.runner_name == runner_name - - -def test_get_raises_error_if_not_found(): - """ - arrange: Nothing. - act: Call get. - assert: A GetMetricsStorageError is raised. - """ - runner_name = secrets.token_hex(16) - - with pytest.raises(GetMetricsStorageError): - storage.get(runner_name) - - -def test_quarantine(filesystem_paths: dict[str, Path], tmp_path: Path): - """ - arrange: Create a storage for a runner with a file in it. - act: Call quarantine. - assert: The storage is moved to the quarantine. 
- """ - runner_name = secrets.token_hex(16) - ms = storage.create(runner_name) - ms.path.joinpath("test.txt").write_text("foo bar") - - storage.move_to_quarantine(storage, runner_name) - - tarfile_path = filesystem_paths["quarantine"].joinpath(runner_name).with_suffix(".tar.gz") - assert tarfile_path.exists() - tarfile.open(tarfile_path).extractall(path=tmp_path) - assert tmp_path.joinpath(f"{runner_name}/test.txt").exists() - assert tmp_path.joinpath(f"{runner_name}/test.txt").read_text(encoding="utf-8") == "foo bar" - assert not ms.path.exists() - - -def test_quarantine_raises_error(): - """ - arrange: Nothing. - act: Call quarantine. - assert: A QuarantineMetricsStorageError is raised. - """ - runner_name = secrets.token_hex(16) - - with pytest.raises(QuarantineMetricsStorageError): - storage.move_to_quarantine(storage, runner_name) diff --git a/tests/unit/mock.py b/tests/unit/mock.py index be3e07ca7..78c0c6990 100644 --- a/tests/unit/mock.py +++ b/tests/unit/mock.py @@ -12,8 +12,9 @@ from pathlib import Path from typing import IO, Optional, Sequence, Union +from github_runner_manager.types_.github import RegistrationToken, RemoveToken, RunnerApplication + from errors import LxdError, RunnerError -from github_type import RegistrationToken, RemoveToken, RunnerApplication from lxd_type import LxdNetwork from runner import LxdInstanceConfig diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py index 443c84dfd..b52afa538 100644 --- a/tests/unit/mock_runner_managers.py +++ b/tests/unit/mock_runner_managers.py @@ -7,17 +7,18 @@ from typing import Iterable, Iterator, Sequence from unittest.mock import MagicMock -from charm_state import GitHubPath -from github_client import GithubClient -from github_type import GitHubRunnerStatus, SelfHostedRunner -from manager.cloud_runner_manager import ( +from github_runner_manager.manager.cloud_runner_manager import ( CloudRunnerInstance, CloudRunnerManager, CloudRunnerState, InstanceId, ) -from 
manager.github_runner_manager import GitHubRunnerState -from metrics.runner import RunnerMetrics +from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.metrics.runner import RunnerMetrics +from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner + +from charm_state import GitHubPath +from github_client import GithubClient from tests.unit.mock import MockGhapiClient diff --git a/tests/unit/reactive/__init__.py b/tests/unit/reactive/__init__.py deleted file mode 100644 index 188515554..000000000 --- a/tests/unit/reactive/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. diff --git a/tests/unit/reactive/test_consumer.py b/tests/unit/reactive/test_consumer.py deleted file mode 100644 index 2a443c9b3..000000000 --- a/tests/unit/reactive/test_consumer.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -import secrets -from contextlib import closing - -import pytest -from kombu import Connection, Message - -from reactive import consumer -from reactive.consumer import JobError - -IN_MEMORY_URI = "memory://" -FAKE_RUN_URL = "https://api.github.com/repos/fakeusergh-runner-test/actions/runs/8200803099" - - -def test_consume(caplog: pytest.LogCaptureFixture): - """ - arrange: A job placed in the message queue. - act: Call consume - assert: The job is logged. 
- """ - queue_name = secrets.token_hex(16) - job_details = consumer.JobDetails( - labels=[secrets.token_hex(16), secrets.token_hex(16)], - run_url=FAKE_RUN_URL, - ) - _put_in_queue(job_details.json(), queue_name) - - # we use construct to avoid pydantic validation as IN_MEMORY_URI is not a valid URL - consumer.consume(IN_MEMORY_URI, queue_name) - assert str(job_details.labels) in caplog.text - assert job_details.run_url in caplog.text - - -@pytest.mark.parametrize( - "job_str", - [ - pytest.param( - '{"labels": ["label1", "label2"], "status": "completed"}', id="run_url missing" - ), - pytest.param( - '{"status": "completed", "run_url": "https://example.com"}', id="labels missing" - ), - pytest.param("no json at all", id="invalid json"), - ], -) -def test_job_details_validation_error(job_str: str): - """ - arrange: A job placed in the message queue with invalid details. - act: Call consume - assert: A JobError is raised and the message is requeued. - """ - queue_name = secrets.token_hex(16) - _put_in_queue(job_str, queue_name) - - with pytest.raises(JobError) as exc_info: - consumer.consume(IN_MEMORY_URI, queue_name) - assert "Invalid job details" in str(exc_info.value) - - # Ensure message has been requeued by reconsuming it - msg = _consume_from_queue(queue_name) - assert msg.payload == job_str - - -def _put_in_queue(msg: str, queue_name: str) -> None: - """Put a job in the message queue. - - Args: - msg: The job details. - queue_name: The name of the queue - """ - with Connection(IN_MEMORY_URI) as conn: - with closing(conn.SimpleQueue(queue_name)) as simple_queue: - simple_queue.put(msg, retry=True) - - -def _consume_from_queue(queue_name: str) -> Message: - """Consume a job from the message queue. - - Args: - queue_name: The name of the queue - - Returns: - The message consumed from the queue. 
- """ - with Connection(IN_MEMORY_URI) as conn: - with closing(conn.SimpleQueue(queue_name)) as simple_queue: - return simple_queue.get(block=False) diff --git a/tests/unit/reactive/test_runner_manager.py b/tests/unit/reactive/test_runner_manager.py deleted file mode 100644 index cd25cf728..000000000 --- a/tests/unit/reactive/test_runner_manager.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import os -import secrets -import subprocess -from pathlib import Path -from subprocess import CompletedProcess -from unittest.mock import MagicMock - -import pytest - -from reactive.runner_manager import ( - PIDS_COMMAND_LINE, - PYTHON_BIN, - REACTIVE_RUNNER_SCRIPT_FILE, - ReactiveRunnerError, - reconcile, -) -from utilities import secure_run_subprocess - -EXAMPLE_MQ_URI = "http://example.com" - - -@pytest.fixture(name="log_dir", autouse=True) -def log_dir_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: - """Return the path to the log file.""" - log_file_path = tmp_path / "logs" - monkeypatch.setattr("reactive.runner_manager.REACTIVE_RUNNER_LOG_DIR", log_file_path) - monkeypatch.setattr("shutil.chown", lambda *args, **kwargs: None) - return log_file_path - - -@pytest.fixture(name="secure_run_subprocess_mock") -def secure_run_subprocess_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock the ps command.""" - secure_run_subprocess_mock = MagicMock(spec=secure_run_subprocess) - monkeypatch.setattr( - "reactive.runner_manager.secure_run_subprocess", secure_run_subprocess_mock - ) - return secure_run_subprocess_mock - - -@pytest.fixture(name="os_kill_mock", autouse=True) -def os_kill_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock the os.kill function.""" - os_kill_mock = MagicMock(spec=os.kill) - monkeypatch.setattr("os.kill", os_kill_mock) - return os_kill_mock - - -@pytest.fixture(name="subprocess_popen_mock") -def 
subprocess_popen_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock the subprocess.Popen function.""" - popen_result = MagicMock(spec=subprocess.Popen, pid=1234, returncode=0) - subprocess_popen_mock = MagicMock( - spec=subprocess.Popen, - return_value=popen_result, - ) - monkeypatch.setattr("subprocess.Popen", subprocess_popen_mock) - return subprocess_popen_mock - - -def test_reconcile_spawns_runners( - secure_run_subprocess_mock: MagicMock, subprocess_popen_mock: MagicMock, log_dir: Path -): - """ - arrange: Mock that two reactive runner processes are active. - act: Call reconcile with a quantity of 5. - assert: Three runners are spawned. Log file is setup. - """ - queue_name = secrets.token_hex(16) - _arrange_reactive_processes(secure_run_subprocess_mock, count=2) - - delta = reconcile(5, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert delta == 3 - assert subprocess_popen_mock.call_count == 3 - assert log_dir.exists() - - -def test_reconcile_does_not_spawn_runners( - secure_run_subprocess_mock: MagicMock, subprocess_popen_mock: MagicMock -): - """ - arrange: Mock that two reactive runner processes are active. - act: Call reconcile with a quantity of 2. - assert: No runners are spawned. - """ - queue_name = secrets.token_hex(16) - _arrange_reactive_processes(secure_run_subprocess_mock, count=2) - - delta = reconcile(2, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert delta == 0 - assert subprocess_popen_mock.call_count == 0 - - -def test_reconcile_kills_processes_for_too_many_processes( - secure_run_subprocess_mock: MagicMock, - subprocess_popen_mock: MagicMock, - os_kill_mock: MagicMock, -): - """ - arrange: Mock that 3 reactive runner processes are active. - act: Call reconcile with a quantity of 1. - assert: 2 processes are killed. 
- """ - queue_name = secrets.token_hex(16) - _arrange_reactive_processes(secure_run_subprocess_mock, count=3) - delta = reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert delta == -2 - assert subprocess_popen_mock.call_count == 0 - assert os_kill_mock.call_count == 2 - - -def test_reconcile_ignore_process_not_found_on_kill( - secure_run_subprocess_mock: MagicMock, - subprocess_popen_mock: MagicMock, - os_kill_mock: MagicMock, -): - """ - arrange: Mock 3 reactive processes and os.kill to fail once with a ProcessLookupError. - act: Call reconcile with a quantity of 1. - assert: The returned delta is still -2. - """ - queue_name = secrets.token_hex(16) - _arrange_reactive_processes(secure_run_subprocess_mock, count=3) - os_kill_mock.side_effect = [None, ProcessLookupError] - delta = reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert delta == -2 - assert subprocess_popen_mock.call_count == 0 - assert os_kill_mock.call_count == 2 - - -def test_reconcile_raises_reactive_runner_error_on_ps_failure( - secure_run_subprocess_mock: MagicMock, -): - """ - arrange: Mock that the ps command fails. - act: Call reconcile with a quantity of 1. - assert: A ReactiveRunnerError is raised. - """ - queue_name = secrets.token_hex(16) - secure_run_subprocess_mock.return_value = CompletedProcess( - args=PIDS_COMMAND_LINE, - returncode=1, - stdout=b"", - stderr=b"error", - ) - - with pytest.raises(ReactiveRunnerError) as err: - reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert "Failed to get list of processes" in str(err.value) - - -def _arrange_reactive_processes(secure_run_subprocess_mock: MagicMock, count: int): - """Mock reactive runner processes are active. - - Args: - secure_run_subprocess_mock: The mock to use for the ps command. - count: The number of processes. 
- """ - process_cmds_before = "\n".join( - [f"{PYTHON_BIN} {REACTIVE_RUNNER_SCRIPT_FILE}\t{i}" for i in range(count)] - ) - - secure_run_subprocess_mock.return_value = CompletedProcess( - args=PIDS_COMMAND_LINE, - returncode=0, - stdout=f"CMD\n{process_cmds_before}".encode("utf-8"), - stderr=b"", - ) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 060bbc96f..a28fc9743 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -12,6 +12,7 @@ import pytest import yaml +from github_runner_manager.types_.github import GitHubOrg, GitHubRepo, GitHubRunnerStatus from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, StatusBase, WaitingStatus from ops.testing import Harness @@ -28,8 +29,6 @@ VM_CPU_CONFIG_NAME, VM_DISK_CONFIG_NAME, Arch, - GitHubOrg, - GitHubRepo, InstanceType, OpenStackCloudsYAML, OpenstackImage, @@ -41,14 +40,12 @@ LogrotateSetupError, MissingMongoDBError, MissingRunnerBinaryError, - OpenStackUnauthorizedError, RunnerError, SubprocessError, TokenError, ) from event_timer import EventTimer, TimerEnableError from firewall import FirewallEntry -from github_type import GitHubRunnerStatus from runner_manager import LXDRunnerManagerConfig, RunnerInfo TEST_PROXY_SERVER_URL = "http://proxy.server:1234" @@ -761,7 +758,6 @@ def test_on_flush_runners_action(self, run, wt, mkdir, rm): pytest.param(ConfigurationError, BlockedStatus, id="charm config error"), pytest.param(TokenError, BlockedStatus, id="github token error"), pytest.param(MissingRunnerBinaryError, MaintenanceStatus, id="runner binary error"), - pytest.param(OpenStackUnauthorizedError, BlockedStatus, id="openstack auth error"), ], ) def test_catch_charm_errors( diff --git a/tests/unit/test_charm_state.py b/tests/unit/test_charm_state.py index d8fdd896d..b7df8a5dc 100644 --- a/tests/unit/test_charm_state.py +++ b/tests/unit/test_charm_state.py @@ -8,20 +8,22 @@ from pathlib import Path from unittest.mock import MagicMock +import 
github_runner_manager.openstack_cloud import pytest import yaml from charms.data_platform_libs.v0.data_interfaces import DatabaseRequires +from github_runner_manager.types_.github import GitHubOrg, GitHubRepo from pydantic import BaseModel from pydantic.error_wrappers import ValidationError from pydantic.networks import IPv4Address import charm_state -import openstack_cloud from charm_state import ( BASE_IMAGE_CONFIG_NAME, DEBUG_SSH_INTEGRATION_NAME, DENYLIST_CONFIG_NAME, DOCKERHUB_MIRROR_CONFIG_NAME, + GROUP_CONFIG_NAME, IMAGE_INTEGRATION_NAME, LABELS_CONFIG_NAME, OPENSTACK_CLOUDS_YAML_CONFIG_NAME, @@ -41,8 +43,6 @@ CharmState, FirewallEntry, GithubConfig, - GitHubOrg, - GitHubRepo, ImmutableConfigChangedError, LocalLxdRunnerConfig, OpenstackImage, @@ -87,20 +87,21 @@ def test_github_org_path(): assert path == org -def test_parse_github_path_invalid(): +def test_github_config_from_charm_invalud_path(): """ arrange: Create an invalid GitHub path string and runner group name. act: Call parse_github_path with the invalid path string and runner group name. assert: Verify that the function raises CharmConfigInvalidError. """ - path_str = "invalidpath/" - runner_group = "test_group" + mock_charm = MockGithubRunnerCharmFactory() + mock_charm.config[PATH_CONFIG_NAME] = "invalidpath/" + mock_charm.config[GROUP_CONFIG_NAME] = "test_group" with pytest.raises(CharmConfigInvalidError): - charm_state.parse_github_path(path_str, runner_group) + GithubConfig.from_charm(mock_charm) -def test_github_config_from_charm_invalid_path(): +def test_github_config_from_charm_empty_path(): """ arrange: Create a mock CharmBase instance with an empty path configuration. act: Call from_charm method with the mock CharmBase instance. 
@@ -367,9 +368,9 @@ def test_parse_openstack_clouds_initialize_fail( mock_charm = MockGithubRunnerCharmFactory() mock_charm.config[OPENSTACK_CLOUDS_YAML_CONFIG_NAME] = valid_yaml_config monkeypatch.setattr( - openstack_cloud, + github_runner_manager.openstack_cloud, "initialize", - MagicMock(side_effect=openstack_cloud.OpenStackInvalidConfigError), + MagicMock(side_effect=github_runner_manager.openstack_cloud.OpenStackInvalidConfigError), ) with pytest.raises(CharmConfigInvalidError): diff --git a/tests/unit/test_github_client.py b/tests/unit/test_github_client.py deleted file mode 100644 index 9bd336a03..000000000 --- a/tests/unit/test_github_client.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import http -import random -import secrets -from collections import namedtuple -from datetime import datetime, timezone -from unittest.mock import MagicMock -from urllib.error import HTTPError - -import pytest - -from charm_state import GitHubRepo -from errors import JobNotFoundError -from github_client import GithubClient -from github_type import JobConclusion, JobStats - -JobStatsRawData = namedtuple( - "JobStatsRawData", - ["created_at", "started_at", "runner_name", "conclusion", "id"], -) - - -@pytest.fixture(name="job_stats_raw") -def job_stats_fixture() -> JobStatsRawData: - """Create a JobStats object.""" - runner_name = secrets.token_hex(16) - return JobStatsRawData( - created_at="2021-10-01T00:00:00Z", - started_at="2021-10-01T01:00:00Z", - conclusion="success", - runner_name=runner_name, - id=random.randint(1, 1000), - ) - - -@pytest.fixture(name="github_client") -def github_client_fixture(job_stats_raw: JobStatsRawData) -> GithubClient: - """Create a GithubClient object with a mocked GhApi object.""" - gh_client = GithubClient("token") - gh_client._client = MagicMock() - gh_client._client.actions.list_jobs_for_workflow_run.return_value = { - "jobs": [ - { - "created_at": job_stats_raw.created_at, - 
"started_at": job_stats_raw.started_at, - "runner_name": job_stats_raw.runner_name, - "conclusion": job_stats_raw.conclusion, - "id": job_stats_raw.id, - } - ] - } - - return gh_client - - -def _mock_multiple_pages_for_job_response( - github_client: GithubClient, job_stats_raw: JobStatsRawData, include_runner: bool = True -): - """Mock the list_jobs_for_workflow_run to return multiple pages. - - Args: - github_client: The GithubClient object to mock. - job_stats_raw: The JobStatsRawData object to use for the response. - include_runner: Whether to include the runner in the response for one of the jobs. - """ - no_of_pages = random.choice(range(1, 5)) - no_of_jobs_per_page = random.choice(range(1, 4)) - runner_names = [secrets.token_hex(16) for _ in range(no_of_pages * no_of_jobs_per_page)] - - if include_runner: - runner_names[random.choice(range(no_of_pages))] = job_stats_raw.runner_name - - github_client._client.actions.list_jobs_for_workflow_run.side_effect = [ - { - "jobs": [ - { - "created_at": job_stats_raw.created_at, - "started_at": job_stats_raw.started_at, - "runner_name": runner_names[i * no_of_jobs_per_page + j], - "conclusion": job_stats_raw.conclusion, - "id": job_stats_raw.id, - } - for j in range(no_of_jobs_per_page) - ] - } - for i in range(no_of_pages) - ] + [{"jobs": []}] - - -def test_get_job_info(github_client: GithubClient, job_stats_raw: JobStatsRawData): - """ - arrange: A mocked Github Client that returns one page of jobs containing one job \ - with the runner. - act: Call get_job_info. - assert: The correct JobStats object is returned. 
- """ - github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - job_stats = github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) - assert job_stats == JobStats( - created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc), - started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - runner_name=job_stats_raw.runner_name, - conclusion=JobConclusion.SUCCESS, - job_id=job_stats_raw.id, - ) - - -def test_get_job_info_no_conclusion(github_client: GithubClient, job_stats_raw: JobStatsRawData): - """ - arrange: A mocked Github Client that returns one page of jobs containing one job \ - with the runner with conclusion set to None. - act: Call get_job_info. - assert: JobStats object with conclusion set to None is returned. - """ - github_client._client.actions.list_jobs_for_workflow_run.return_value = { - "jobs": [ - { - "created_at": job_stats_raw.created_at, - "started_at": job_stats_raw.started_at, - "runner_name": job_stats_raw.runner_name, - "conclusion": None, - "id": job_stats_raw.id, - } - ] - } - github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - job_stats = github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) - assert job_stats == JobStats( - created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc), - started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - runner_name=job_stats_raw.runner_name, - conclusion=None, - job_id=job_stats_raw.id, - ) - - -def test_github_api_pagination_multiple_pages( - github_client: GithubClient, job_stats_raw: JobStatsRawData -): - """ - arrange: A mocked Github Client that returns multiple pages of jobs containing \ - one job with the runner. - act: Call get_job_info. - assert: The correct JobStats object is returned. 
- """ - _mock_multiple_pages_for_job_response( - github_client=github_client, job_stats_raw=job_stats_raw, include_runner=True - ) - - github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - job_stats = github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) - assert job_stats == JobStats( - created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc), - started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - runner_name=job_stats_raw.runner_name, - conclusion=JobConclusion.SUCCESS, - job_id=job_stats_raw.id, - ) - - -def test_github_api_pagination_job_not_found( - github_client: GithubClient, job_stats_raw: JobStatsRawData -): - """ - arrange: A mocked Github Client that returns multiple pages of jobs containing \ - no job with the runner. - act: Call get_job_info. - assert: An exception is raised. - """ - _mock_multiple_pages_for_job_response( - github_client=github_client, job_stats_raw=job_stats_raw, include_runner=False - ) - - github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - - with pytest.raises(JobNotFoundError): - github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) - - -def test_github_api_http_error(github_client: GithubClient, job_stats_raw: JobStatsRawData): - github_client._client.actions.list_jobs_for_workflow_run.side_effect = HTTPError( - "http://test.com", 500, "", http.client.HTTPMessage(), None - ) - github_repo = GitHubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - - with pytest.raises(JobNotFoundError): - github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) diff --git a/tests/unit/test_lxd_runner_manager.py b/tests/unit/test_lxd_runner_manager.py index 36c36df11..215cbe7e0 100644 --- 
a/tests/unit/test_lxd_runner_manager.py +++ b/tests/unit/test_lxd_runner_manager.py @@ -7,26 +7,29 @@ from pathlib import Path from unittest.mock import MagicMock, call +import github_runner_manager.reactive.runner_manager import pytest +from github_runner_manager.metrics.events import ( + Reconciliation, + RunnerInstalled, + RunnerStart, + RunnerStop, +) +from github_runner_manager.metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME +from github_runner_manager.metrics.storage import MetricsStorage +from github_runner_manager.types_.github import GitHubOrg, GitHubRepo, RunnerApplication from pytest import LogCaptureFixture, MonkeyPatch -import reactive.runner_manager import shared_fs from charm_state import ( Arch, CharmConfig, CharmState, - GitHubOrg, - GitHubRepo, ProxyConfig, ReactiveConfig, VirtualMachineResources, ) from errors import IssueMetricEventError, RunnerBinaryError -from github_type import RunnerApplication -from metrics.events import Reconciliation, RunnerInstalled, RunnerStart, RunnerStop -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from metrics.storage import MetricsStorage from runner import Runner, RunnerStatus from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, LXDRunnerManager, LXDRunnerManagerConfig from runner_type import RunnerNameByHealth @@ -107,7 +110,7 @@ def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state): def issue_event_mock_fixture(monkeypatch: MonkeyPatch) -> MagicMock: """Mock the issue_event function.""" issue_event_mock = MagicMock() - monkeypatch.setattr("metrics.events.issue_event", issue_event_mock) + monkeypatch.setattr("github_runner_manager.metrics.events.issue_event", issue_event_mock) return issue_event_mock @@ -131,7 +134,7 @@ def runner_metrics_fixture(monkeypatch: MonkeyPatch) -> MagicMock: @pytest.fixture(name="reactive_reconcile_mock") def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> MagicMock: """Mock the job class.""" - reconcile_mock = 
MagicMock(spec=reactive.runner_manager.reconcile) + reconcile_mock = MagicMock(spec=github_runner_manager.reactive.runner_manager.reconcile) monkeypatch.setattr("runner_manager.reactive_runner_manager.reconcile", reconcile_mock) reconcile_mock.side_effect = lambda quantity, **kwargs: quantity return reconcile_mock diff --git a/tests/unit/test_openstack_cloud.py b/tests/unit/test_openstack_cloud.py deleted file mode 100644 index 4f599e914..000000000 --- a/tests/unit/test_openstack_cloud.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -from pathlib import Path - -import pytest -import yaml - -import openstack_cloud -from errors import OpenStackInvalidConfigError - - -def test_initialize(clouds_yaml_path: Path, clouds_yaml: dict): - """ - arrange: Mocked clouds.yaml data and path. - act: Call initialize. - assert: The clouds.yaml file is written to disk. - """ - openstack_cloud.initialize(clouds_yaml) - - assert yaml.safe_load(clouds_yaml_path.read_text(encoding="utf-8")) == clouds_yaml - - -@pytest.mark.parametrize( - "invalid_yaml, expected_err_msg", - [ - pytest.param( - {"wrong-key": {"cloud_name": {"auth": {}}}}, "Missing key 'clouds' from config." - ), - pytest.param({}, "Missing key 'clouds' from config."), - pytest.param({"clouds": {}}, "No clouds defined in clouds.yaml."), - ], -) -def test_initialize_validation_error(invalid_yaml: dict, expected_err_msg): - """ - arrange: Mocked clouds.yaml data with invalid data. - act: Call initialize. - assert: InvalidConfigError is raised. 
- """ - with pytest.raises(OpenStackInvalidConfigError) as exc: - openstack_cloud.initialize(invalid_yaml) - assert expected_err_msg in str(exc) diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py index af7954d06..e6d57f305 100644 --- a/tests/unit/test_runner.py +++ b/tests/unit/test_runner.py @@ -8,12 +8,14 @@ from pathlib import Path from unittest.mock import MagicMock, call +import github_runner_manager.metrics.runner_logs import jinja2 import pytest from _pytest.monkeypatch import MonkeyPatch +from github_runner_manager.metrics.storage import MetricsStorage +from github_runner_manager.types_.github import GitHubOrg, GitHubRepo -import metrics.runner_logs -from charm_state import GitHubOrg, GitHubRepo, SSHDebugConnection, VirtualMachineResources +from charm_state import SSHDebugConnection, VirtualMachineResources from errors import ( CreateMetricsStorageError, LxdError, @@ -22,7 +24,6 @@ RunnerRemoveError, ) from lxd import LxdInstance, LxdInstanceFileManager -from metrics.storage import MetricsStorage from runner import DIAG_DIR_PATH, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus from runner_manager_type import RunnerManagerClients from runner_type import ProxySetting @@ -102,7 +103,9 @@ def create_logs_dir(runner_name: str) -> Path: return target_log_path - create_logs_dir_mock = MagicMock(spec=metrics.runner_logs.create_logs_dir) + create_logs_dir_mock = MagicMock( + spec=github_runner_manager.metrics.runner_logs.create_logs_dir + ) create_logs_dir_mock.side_effect = create_logs_dir monkeypatch.setattr("runner.create_logs_dir", create_logs_dir_mock) @@ -522,7 +525,7 @@ def test_pull_logs(runner: Runner, log_dir_base_path: Path): runner.instance.files.pull_file.assert_has_calls( [ call(str(DIAG_DIR_PATH), str(log_dir_path), is_dir=True), - call(str(metrics.runner_logs.SYSLOG_PATH), str(log_dir_path)), + call(str(github_runner_manager.metrics.runner_logs.SYSLOG_PATH), str(log_dir_path)), ] ) diff --git a/tests/unit/test_runner_scaler.py 
b/tests/unit/test_runner_scaler.py index 845c8da49..f3199fd99 100644 --- a/tests/unit/test_runner_scaler.py +++ b/tests/unit/test_runner_scaler.py @@ -6,12 +6,16 @@ from unittest.mock import MagicMock import pytest +from github_runner_manager.manager.cloud_runner_manager import CloudRunnerState, InstanceId +from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.manager.runner_manager import ( + FlushMode, + RunnerManager, + RunnerManagerConfig, +) +from github_runner_manager.manager.runner_scaler import RunnerScaler +from github_runner_manager.types_.github import GitHubPath, GitHubRepo -from charm_state import GitHubPath, GitHubRepo -from manager.cloud_runner_manager import CloudRunnerState, InstanceId -from manager.github_runner_manager import GitHubRunnerState -from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig -from manager.runner_scaler import RunnerScaler from tests.unit.mock_runner_managers import ( MockCloudRunnerManager, MockGitHubRunnerManager, @@ -58,11 +62,16 @@ def runner_manager_fixture( ) -> RunnerManager: mock_cloud, mock_github = mock_runner_managers monkeypatch.setattr( - "manager.runner_manager.RunnerManager._spawn_runners", mock_runner_manager_spawn_runners + "github_runner_manager.manager.runner_manager.RunnerManager._spawn_runners", + mock_runner_manager_spawn_runners, ) # Patch out the metrics, as metrics has their own tests. 
- monkeypatch.setattr("manager.runner_manager.github_metrics.job", MagicMock()) - monkeypatch.setattr("manager.runner_manager.runner_metrics.issue_events", MagicMock()) + monkeypatch.setattr( + "github_runner_manager.manager.runner_manager.github_metrics.job", MagicMock() + ) + monkeypatch.setattr( + "github_runner_manager.manager.runner_manager.runner_metrics.issue_events", MagicMock() + ) config = RunnerManagerConfig("mock_token", github_path) runner_manager = RunnerManager("mock_runners", mock_cloud, config) diff --git a/tests/unit/test_shared_fs.py b/tests/unit/test_shared_fs.py index 0c1266566..2a21bf3cc 100644 --- a/tests/unit/test_shared_fs.py +++ b/tests/unit/test_shared_fs.py @@ -7,6 +7,7 @@ import pytest from _pytest.monkeypatch import MonkeyPatch +from github_runner_manager.metrics.storage import MetricsStorage import shared_fs from errors import ( @@ -15,7 +16,6 @@ GetMetricsStorageError, SubprocessError, ) -from metrics.storage import MetricsStorage MOUNTPOINT_FAILURE_EXIT_CODE = 1 From 0e891ccbb5fb37c470a326fa2437c592f418d98b Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Wed, 11 Sep 2024 07:26:11 +0200 Subject: [PATCH 06/10] Fix units with stuck reconciliation (#367) * add timeout command * Check-in panel with average actual reconciliation interval * introduce constant * update dashboard * rename panel using term "Application" * update docs * remove Optional --- docs/reference/cos.md | 2 + src-docs/charm.md | 11 +- src-docs/event_timer.md | 8 +- src/charm.py | 7 +- src/event_timer.py | 11 +- src/grafana_dashboards/metrics.json | 201 ++++++++++++++++++++++------ templates/dispatch-event.service.j2 | 2 +- 7 files changed, 181 insertions(+), 61 deletions(-) diff --git a/docs/reference/cos.md b/docs/reference/cos.md index 898845c3e..799bc8a95 100644 --- a/docs/reference/cos.md +++ b/docs/reference/cos.md @@ -19,6 +19,8 @@ The "GitHub Self-Hosted Runner Metrics" metrics dashboard presents the following - Runner idle duration - Charm reconciliation 
duration - Job queue duration - how long a job waits in the queue before a runner picks it up + - Max job queue duration by application: Similar to "Job queue duration" panel, but shows maximum durations by charm application. + - Average reconciliation interval: Shows the average time between reconciliation events, broken down by charm application. - Jobs: Displays certain metrics about the jobs executed by the runners. These metrics can be displayed per repository by specifying a regular expression on the `Repository` variable. The following metrics are displayed: - Proportion charts: Share of jobs by completion status, job conclusion, application, repo policy check failure http codes and github events over time. diff --git a/src-docs/charm.md b/src-docs/charm.md index fa38de542..5b1540fee 100644 --- a/src-docs/charm.md +++ b/src-docs/charm.md @@ -15,12 +15,13 @@ Charm for creating and managing GitHub self-hosted runner instances. - **RECONCILE_INTERVAL_CONFIG_NAME** - **TEST_MODE_CONFIG_NAME** - **TOKEN_CONFIG_NAME** +- **RECONCILIATION_INTERVAL_TIMEOUT_FACTOR** - **RECONCILE_RUNNERS_EVENT** - **REACTIVE_MQ_DB_NAME** --- - + ## function `catch_charm_errors` @@ -46,7 +47,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -72,7 +73,7 @@ Catch common errors in actions. --- - + ## class `ReconcileRunnersEvent` Event representing a periodic check to ensure runners are ok. @@ -83,7 +84,7 @@ Event representing a periodic check to ensure runners are ok. --- - + ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. @@ -100,7 +101,7 @@ Charm for managing GitHub self-hosted runners. - `ram_pool_path`: The path to memdisk storage. - `kernel_module_path`: The path to kernel modules. - + ### method `__init__` diff --git a/src-docs/event_timer.md b/src-docs/event_timer.md index 57055e575..7d10d807c 100644 --- a/src-docs/event_timer.md +++ b/src-docs/event_timer.md @@ -109,7 +109,7 @@ Construct the timer manager. 
--- - + ### method `disable_event_timer` @@ -138,11 +138,7 @@ Disable the systemd timer for the given event. ### method `ensure_event_timer` ```python -ensure_event_timer( - event_name: str, - interval: int, - timeout: Optional[int] = None -) → None +ensure_event_timer(event_name: str, interval: int, timeout: int) → None ``` Ensure that a systemd service and timer are registered to dispatch the given event. diff --git a/src/charm.py b/src/charm.py index 0b17dbf52..9dae00059 100755 --- a/src/charm.py +++ b/src/charm.py @@ -94,6 +94,10 @@ from runner_manager import LXDRunnerManager, LXDRunnerManagerConfig from runner_manager_type import LXDFlushMode +# We assume a stuck reconcile event when it takes longer +# than 10 times a normal interval. Currently, we are only aware of +# https://bugs.launchpad.net/juju/+bug/2055184 causing a stuck reconcile event. +RECONCILIATION_INTERVAL_TIMEOUT_FACTOR = 10 RECONCILE_RUNNERS_EVENT = "reconcile-runners" # This is currently hardcoded and may be moved to a config option in the future. 
@@ -555,7 +559,8 @@ def _set_reconcile_timer(self) -> None: self._event_timer.ensure_event_timer( event_name="reconcile-runners", interval=int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]), - timeout=int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]) - 1, + timeout=RECONCILIATION_INTERVAL_TIMEOUT_FACTOR + * int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]), ) def _ensure_reconcile_timer_is_active(self) -> None: diff --git a/src/event_timer.py b/src/event_timer.py index da03150ff..ab94c5736 100644 --- a/src/event_timer.py +++ b/src/event_timer.py @@ -5,7 +5,7 @@ import logging import subprocess # nosec B404 from pathlib import Path -from typing import Optional, TypedDict +from typing import TypedDict import jinja2 @@ -107,9 +107,7 @@ def is_active(self, event_name: str) -> bool: return ret_code == 0 - def ensure_event_timer( - self, event_name: str, interval: int, timeout: Optional[int] = None - ) -> None: + def ensure_event_timer(self, event_name: str, interval: int, timeout: int) -> None: """Ensure that a systemd service and timer are registered to dispatch the given event. The interval is how frequently, in minutes, the event should be dispatched. @@ -125,10 +123,7 @@ def ensure_event_timer( Raises: TimerEnableError: Timer cannot be started. Events will be not emitted. 
""" - if timeout is not None: - timeout_in_secs = timeout * 60 - else: - timeout_in_secs = interval * 30 + timeout_in_secs = timeout * 60 context: EventConfig = { "event": event_name, diff --git a/src/grafana_dashboards/metrics.json b/src/grafana_dashboards/metrics.json index 8582b77ef..0e8728765 100644 --- a/src/grafana_dashboards/metrics.json +++ b/src/grafana_dashboards/metrics.json @@ -24,10 +24,23 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 537, + "id": 567, "links": [], "liveNow": false, "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "title": "General", + "type": "row" + }, { "datasource": { "type": "loki", @@ -91,7 +104,7 @@ "h": 8, "w": 12, "x": 0, - "y": 0 + "y": 1 }, "id": 3, "options": { @@ -212,7 +225,7 @@ "h": 8, "w": 12, "x": 12, - "y": 0 + "y": 1 }, "id": 22, "options": { @@ -253,19 +266,6 @@ "title": "Available Runners", "type": "barchart" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 8 - }, - "id": 10, - "panels": [], - "title": "General", - "type": "row" - }, { "datasource": { "type": "loki", @@ -485,7 +485,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\", duration=\"queue_duration\", flavor=\"flavor\" | __error__=`` | event = `runner_start` | flavor =~ `$flavor` | unwrap duration [1h]) by (flavor)", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"queue_duration\",flavor=\"flavor\" | __error__=\"\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap duration[1h]) 
by(flavor)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "{{flavor}}", @@ -493,7 +493,7 @@ "refId": "E" } ], - "title": "Max Job Queue Duration By Flavour", + "title": "Max Job Queue Duration By Application", "type": "timeseries" }, { @@ -789,10 +789,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, + "x": 0, "y": 25 }, - "id": 4, + "id": 24, "options": { "legend": { "calcs": [], @@ -812,7 +812,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "50%", "queryType": "range", @@ -824,12 +824,12 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json 
event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "95%", "queryType": "range", - "refId": "C" + "refId": "D" }, { "datasource": { @@ -837,30 +837,151 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "99%", "queryType": "range", - "refId": "D" + "refId": "E" }, { "datasource": { "type": "loki", "uid": "${lokids}" }, - "editorMode": "code", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "editorMode": "builder", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | 
event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "Max", "queryType": "range", - "refId": "E" + "refId": "C" } ], - "title": "Runner Idle Duration (Percentile)", + "title": "Reconciliation Duration (Percentile)", "type": "timeseries" }, + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "description": "The average actual reconciliation interval.\n\nUnlike the Reconciliation Duration panel, this panel shows the time between the actual start of reconciliations. This is useful for identifying problems where the reconciliation itself is behaving normally, but the event itself is not being scheduled as expected.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "Duration" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 6, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "orientation": "horizontal", + "showValue": "never", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + 
"xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "editorMode": "code", + "expr": "1 / sum by (flavor) (rate({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\", flavor=\"flavor\" | event=\"reconciliation\" [$__range]))", + "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", + "legendFormat": "", + "queryType": "instant", + "refId": "A" + } + ], + "title": "Average Reconciliation Interval", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": true, + "field": "Value #A" + } + ] + } + } + ], + "type": "barchart" + }, { "datasource": { "type": "loki", @@ -926,7 +1047,7 @@ "x": 12, "y": 33 }, - "id": 6, + "id": 4, "options": { "legend": { "calcs": [], @@ -946,7 +1067,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "50%", "queryType": "range", @@ -958,12 +1079,12 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", 
juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "95%", "queryType": "range", - "refId": "D" + "refId": "C" }, { "datasource": { @@ -971,28 +1092,28 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "99%", "queryType": "range", - "refId": "E" + "refId": "D" }, { "datasource": { "type": "loki", "uid": "${lokids}" }, - "editorMode": "builder", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", 
juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "editorMode": "code", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "Max", "queryType": "range", - "refId": "C" + "refId": "E" } ], - "title": "Reconciliation Duration (Percentile)", + "title": "Runner Idle Duration (Percentile)", "type": "timeseries" }, { @@ -1924,6 +2045,6 @@ "timepicker": {}, "timezone": "", "title": "GitHub Self-Hosted Runner Metrics", - "version": 14, + "version": 15, "weekStart": "" } diff --git a/templates/dispatch-event.service.j2 b/templates/dispatch-event.service.j2 index 7bef08f96..b4482795f 100644 --- a/templates/dispatch-event.service.j2 +++ b/templates/dispatch-event.service.j2 @@ -4,7 +4,7 @@ Description=Dispatch the {{event}} event on {{unit}} [Service] Type=oneshot # For juju 3 and juju 2 compatibility. The juju-run binary was renamed to juju-exec for juju 3. 
-ExecStart=/usr/bin/run-one /usr/bin/bash -c '/usr/bin/juju-exec "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch" || /usr/bin/juju-run "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch"' +ExecStart=/usr/bin/timeout "{{timeout}}" /usr/bin/run-one /usr/bin/bash -c '/usr/bin/juju-exec "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch" || /usr/bin/juju-run "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch"' [Install] WantedBy=multi-user.target From 83459cc3a024bd041061a4d65e5ef23bc2313afb Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Wed, 11 Sep 2024 11:50:16 +0200 Subject: [PATCH 07/10] Fix missing reconciliation metric output (#368) * pin github_runner_manager version * pin correct commit --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4d219d184..4ec1f116c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ PyYAML ==6.0.* pyOpenSSL==24.2.1 kombu==5.4.0 pymongo==4.8.0 -github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@1f310b22b99a94bd5429184191558426b014ee82 +github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@d9013e0795c3265a3379e5d642d1ef3df04bb777 From c535ad3a7ea013ade775a6b934ab3e25c2696de4 Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Thu, 12 Sep 2024 07:04:32 +0200 Subject: [PATCH 08/10] Fix SSH health check (#369) * pin fix/heath_check * chore: pin merged commit --------- Co-authored-by: Yanks Yoon --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4ec1f116c..525a0a4cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ PyYAML ==6.0.* pyOpenSSL==24.2.1 kombu==5.4.0 pymongo==4.8.0 -github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@d9013e0795c3265a3379e5d642d1ef3df04bb777 +github_runner_manager @ 
git+https://github.com/canonical/github-runner-manager.git@6f0f8679a46b727fddc866f25d8c139672f2a9e1 From 14cd540fda3e85ce31c32a6a1f20dbcd25117b2e Mon Sep 17 00:00:00 2001 From: Christopher Bartz Date: Thu, 12 Sep 2024 20:12:35 +0200 Subject: [PATCH 09/10] feat: health check (#372) * pin branch * pin correct commit * bump kombu --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 525a0a4cc..6be16381d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,6 @@ cosl ==0.0.15 # juju 3.1.2.0 depends on pyyaml<=6.0 and >=5.1.2 PyYAML ==6.0.* pyOpenSSL==24.2.1 -kombu==5.4.0 +kombu==5.4.1 pymongo==4.8.0 -github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@6f0f8679a46b727fddc866f25d8c139672f2a9e1 +github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@771530633cd8b3cb18cb95399c234c82118b77e2 From e225a87b9b270585db593be1fda488ab24b250d2 Mon Sep 17 00:00:00 2001 From: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com> Date: Fri, 13 Sep 2024 19:30:38 +0800 Subject: [PATCH 10/10] test: run test w/ new lib (#373) * test: run test w/ new lib * chore: replace merged hash --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6be16381d..d49ee5719 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ PyYAML ==6.0.* pyOpenSSL==24.2.1 kombu==5.4.1 pymongo==4.8.0 -github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@771530633cd8b3cb18cb95399c234c82118b77e2 +github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@6ee40e296f92f191dd12b1e24a613dfdbdc70f99