diff --git a/docs/reference/cos.md b/docs/reference/cos.md index 898845c3e..799bc8a95 100644 --- a/docs/reference/cos.md +++ b/docs/reference/cos.md @@ -19,6 +19,8 @@ The "GitHub Self-Hosted Runner Metrics" metrics dashboard presents the following - Runner idle duration - Charm reconciliation duration - Job queue duration - how long a job waits in the queue before a runner picks it up + - Max job queue duration by application: Similar to "Job queue duration" panel, but shows maximum durations by charm application. + - Average reconciliation interval: Shows the average time between reconciliation events, broken down by charm application. - Jobs: Displays certain metrics about the jobs executed by the runners. These metrics can be displayed per repository by specifying a regular expression on the `Repository` variable. The following metrics are displayed: - Proportion charts: Share of jobs by completion status, job conclusion, application, repo policy check failure http codes and github events over time. diff --git a/requirements.txt b/requirements.txt index 8a1163473..1c6394e6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,11 +6,12 @@ ops>=2.8 pylxd @ git+https://github.com/canonical/pylxd requests typing-extensions -cryptography <=43.0.0 +cryptography <=43.0.1 pydantic ==1.10.17 cosl ==0.0.15 # juju 3.1.2.0 depends on pyyaml<=6.0 and >=5.1.2 PyYAML ==6.0.* pyOpenSSL==24.2.1 -kombu==5.4.0 +kombu==5.4.1 pymongo==4.8.0 +github_runner_manager @ git+https://github.com/canonical/github-runner-manager.git@6ee40e296f92f191dd12b1e24a613dfdbdc70f99 diff --git a/scripts/reactive_runner.py b/scripts/reactive_runner.py deleted file mode 100644 index e9b996ff6..000000000 --- a/scripts/reactive_runner.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Script to spawn a reactive runner process.""" -import logging -import os -import sys - -from reactive.consumer import consume -from reactive.runner_manager import MQ_URI_ENV_VAR, QUEUE_NAME_ENV_VAR - - -def setup_root_logging() -> None: - """Set up logging for the reactive runner process.""" - # setup root logger to log in a file which will be picked up by grafana agent and sent to Loki - logging.basicConfig( - stream=sys.stdout, - level=logging.DEBUG, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - ) - - -def main() -> None: - """Spawn a process that consumes a message from the queue to create a runner. - - Raises: - ValueError: If the required environment variables are not set - """ - mq_uri = os.environ.get(MQ_URI_ENV_VAR) - queue_name = os.environ.get(QUEUE_NAME_ENV_VAR) - - if not mq_uri: - raise ValueError( - f"Missing {MQ_URI_ENV_VAR} environment variable. " - "Please set it to the message queue URI." - ) - - if not queue_name: - raise ValueError( - f"Missing {QUEUE_NAME_ENV_VAR} environment variable. " - "Please set it to the name of the queue." - ) - - setup_root_logging() - consume(mq_uri, queue_name) - - -if __name__ == "__main__": - main() diff --git a/src-docs/charm.md b/src-docs/charm.md index c03d1411b..5b1540fee 100644 --- a/src-docs/charm.md +++ b/src-docs/charm.md @@ -15,12 +15,13 @@ Charm for creating and managing GitHub self-hosted runner instances. - **RECONCILE_INTERVAL_CONFIG_NAME** - **TEST_MODE_CONFIG_NAME** - **TOKEN_CONFIG_NAME** +- **RECONCILIATION_INTERVAL_TIMEOUT_FACTOR** - **RECONCILE_RUNNERS_EVENT** - **REACTIVE_MQ_DB_NAME** --- - + ## function `catch_charm_errors` @@ -46,7 +47,7 @@ Catch common errors in charm. --- - + ## function `catch_action_errors` @@ -72,7 +73,7 @@ Catch common errors in actions. --- - + ## class `ReconcileRunnersEvent` Event representing a periodic check to ensure runners are ok. @@ -83,7 +84,7 @@ Event representing a periodic check to ensure runners are ok. 
--- - + ## class `GithubRunnerCharm` Charm for managing GitHub self-hosted runners. @@ -100,7 +101,7 @@ Charm for managing GitHub self-hosted runners. - `ram_pool_path`: The path to memdisk storage. - `kernel_module_path`: The path to kernel modules. - + ### method `__init__` diff --git a/src-docs/charm_state.py.md b/src-docs/charm_state.md similarity index 71% rename from src-docs/charm_state.py.md rename to src-docs/charm_state.md index 5783f6821..9b4889d5e 100644 --- a/src-docs/charm_state.py.md +++ b/src-docs/charm_state.md @@ -2,7 +2,7 @@ -# module `charm_state.py` +# module `charm_state` State of the Charm. **Global Variables** @@ -33,49 +33,100 @@ State of the Charm. - **COS_AGENT_INTEGRATION_NAME** - **DEBUG_SSH_INTEGRATION_NAME** - **IMAGE_INTEGRATION_NAME** +- **MONGO_DB_INTEGRATION_NAME** - **LTS_IMAGE_VERSION_TAG_MAP** + +--- + + + +## class `AnyHttpsUrl` +Represents an HTTPS URL. + + + +**Attributes:** + + - `allowed_schemes`: Allowed schemes for the URL. + + + + + --- - + + +## class `GithubConfig` +Charm configuration related to GitHub. -## function `parse_github_path` + + +**Attributes:** + + - `token`: The Github API access token (PAT). + - `path`: The Github org/repo path. + + + +### method `__init__` ```python -parse_github_path(path_str: str, runner_group: str) → GithubOrg | GithubRepo +__init__(token: str, path: GitHubOrg | GitHubRepo) → None ``` -Parse GitHub path. + + + + + + + +--- + + + +### classmethod `from_charm` + +```python +from_charm(charm: CharmBase) → GithubConfig +``` + +Get github related charm configuration values from charm. **Args:** - - `path_str`: GitHub path in string format. - - `runner_group`: Runner group name for GitHub organization. If the path is a repository this argument is ignored. + - `charm`: The charm instance. **Raises:** - - `CharmConfigInvalidError`: if an invalid path string was given. + - `CharmConfigInvalidError`: If an invalid configuration value was set. 
**Returns:** - GithubPath object representing the GitHub repository, or the GitHub organization with runner group information. + The parsed GitHub configuration values. --- -## class `AnyHttpsUrl` -Represents an HTTPS URL. + + +## class `VirtualMachineResources` +Virtual machine resource configuration. **Attributes:** - - `allowed_schemes`: Allowed schemes for the URL. + - `cpu`: Number of vCPU for the virtual machine. + - `memory`: Amount of memory for the virtual machine. + - `disk`: Amount of disk for the virtual machine. @@ -83,6 +134,8 @@ Represents an HTTPS URL. --- + + ## class `Arch` Supported system architectures. @@ -99,15 +152,17 @@ Supported system architectures. --- -## class `BaseImage` -The ubuntu OS base image to build and deploy runners on. + + +## class `RunnerStorage` +Supported storage as runner disk. **Attributes:** - - `JAMMY`: The jammy ubuntu LTS image. - - `NOBLE`: The noble ubuntu LTS image. + - `JUJU_STORAGE`: Represents runner storage from Juju storage. + - `MEMORY`: Represents tmpfs storage (ramdisk). @@ -115,64 +170,80 @@ The ubuntu OS base image to build and deploy runners on. --- -## class `CharmConfig` -General charm configuration. + -Some charm configurations are grouped into other configuration models. +## class `InstanceType` +Type of instance for runner. **Attributes:** - - `denylist`: List of IPv4 to block the runners from accessing. - - `dockerhub_mirror`: Private docker registry as dockerhub mirror for the runners to use. - - `labels`: Additional runner labels to append to default (i.e. os, flavor, architecture). - - `openstack_clouds_yaml`: The openstack clouds.yaml configuration. - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - `reconcile_interval`: Time between each reconciliation of runners in minutes. - - `repo_policy_compliance`: Configuration for the repo policy compliance service. - - `token`: GitHub personal access token for GitHub API. 
+ - `LOCAL_LXD`: LXD instance on the local juju machine. + - `OPENSTACK`: OpenStack instance on a cloud. + --- - + + +## class `CharmConfigInvalidError` +Raised when charm config is invalid. + -### classmethod `check_reconcile_interval` + +**Attributes:** + + - `msg`: Explanation of the error. + + + +### method `__init__` ```python -check_reconcile_interval(reconcile_interval: int) → int +__init__(msg: str) ``` -Validate the general charm configuration. +Initialize a new instance of the CharmConfigInvalidError exception. **Args:** - - `reconcile_interval`: The value of reconcile_interval passed to class instantiation. + - `msg`: Explanation of the error. -**Raises:** + + +--- + + + +## class `RepoPolicyComplianceConfig` +Configuration for the repo policy compliance service. + + + +**Attributes:** - - `ValueError`: if an invalid reconcile_interval value of less than 2 has been passed. + - `token`: Token for the repo policy compliance service. + - `url`: URL of the repo policy compliance service. -**Returns:** - The validated reconcile_interval value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → CharmConfig +from_charm(charm: CharmBase) → RepoPolicyComplianceConfig ``` Initialize the config from charm. @@ -187,121 +258,96 @@ Initialize the config from charm. **Raises:** - - `CharmConfigInvalidError`: If any invalid configuration has been set on the charm. + - `CharmConfigInvalidError`: If an invalid configuration was set. **Returns:** - Current config of the charm. + Current repo-policy-compliance config. --- -## class `CharmConfigInvalidError` -Raised when charm config is invalid. + + +## class `OpenStackCloudsYAML` +The OpenStack clouds YAML dict mapping. **Attributes:** - - `msg`: Explanation of the error. - - - -### function `__init__` - -```python -__init__(msg: str) -``` - -Initialize a new instance of the CharmConfigInvalidError exception. - + - `clouds`: The map of cloud name to cloud connection info. 
-**Args:** - - - `msg`: Explanation of the error. +--- + ---- +## class `CharmConfig` +General charm configuration. -## class `CharmState` -The charm state. +Some charm configurations are grouped into other configuration models. **Attributes:** - - `arch`: The underlying compute architecture, i.e. x86_64, amd64, arm64/aarch64. - - `charm_config`: Configuration of the juju charm. - - `is_metrics_logging_available`: Whether the charm is able to issue metrics. - - `proxy_config`: Proxy-related configuration. - - `instance_type`: The type of instances, e.g., local lxd, openstack. - - `runner_config`: The charm configuration related to runner VM configuration. - - `ssh_debug_connections`: SSH debug connections configuration information. + - `denylist`: List of IPv4 to block the runners from accessing. + - `dockerhub_mirror`: Private docker registry as dockerhub mirror for the runners to use. + - `labels`: Additional runner labels to append to default (i.e. os, flavor, architecture). + - `openstack_clouds_yaml`: The openstack clouds.yaml configuration. + - `path`: GitHub repository path in the format '/', or the GitHub organization name. + - `reconcile_interval`: Time between each reconciliation of runners in minutes. + - `repo_policy_compliance`: Configuration for the repo policy compliance service. + - `token`: GitHub personal access token for GitHub API. --- - + -### classmethod `from_charm` +### classmethod `check_reconcile_interval` ```python -from_charm(charm: CharmBase) → CharmState +check_reconcile_interval(reconcile_interval: int) → int ``` -Initialize the state from charm. +Validate the general charm configuration. **Args:** - - `charm`: The charm instance. + - `reconcile_interval`: The value of reconcile_interval passed to class instantiation. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration was set. + - `ValueError`: if an invalid reconcile_interval value of less than 2 has been passed. **Returns:** - Current state of the charm. 
- - ---- - -## class `GithubConfig` -Charm configuration related to GitHub. - - - -**Attributes:** - - - `token`: The Github API access token (PAT). - - `path`: The Github org/repo path. - - - + The validated reconcile_interval value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → GithubConfig +from_charm(charm: CharmBase) → CharmConfig ``` -Get github related charm configuration values from charm. +Initialize the config from charm. @@ -313,123 +359,128 @@ Get github related charm configuration values from charm. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration value was set. + - `CharmConfigInvalidError`: If any invalid configuration has been set on the charm. **Returns:** - The parsed GitHub configuration values. + Current config of the charm. --- -## class `GithubOrg` -Represent GitHub organization. + + +## class `BaseImage` +The ubuntu OS base image to build and deploy runners on. **Attributes:** - - `org`: Name of the GitHub organization. - - `group`: Runner group to spawn the runners in. + - `JAMMY`: The jammy ubuntu LTS image. + - `NOBLE`: The noble ubuntu LTS image. + --- - + -### function `path` +## class `OpenstackImage` +OpenstackImage from image builder relation data. -```python -path() → str -``` -Return a string representing the path. +**Attributes:** + + - `id`: The OpenStack image ID. + - `tags`: Image tags, e.g. jammy -**Returns:** - Path to the GitHub entity. --- -## class `GithubRepo` -Represent GitHub repository. + +### classmethod `from_charm` +```python +from_charm(charm: CharmBase) → OpenstackImage | None +``` -**Attributes:** +Initialize the OpenstackImage info from relation data. + +None represents relation not established. None values for id/tags represent image not yet ready but the relation exists. + + + +**Args:** - - `owner`: Owner of the GitHub repository. - - `repo`: Name of the GitHub repository. + - `charm`: The charm instance. 
+**Returns:** + OpenstackImage metadata from charm relation data. + --- - + -### function `path` +## class `OpenstackRunnerConfig` +Runner configuration for OpenStack Instances. -```python -path() → str -``` -Return a string representing the path. +**Attributes:** + + - `virtual_machines`: Number of virtual machine-based runner to spawn. + - `openstack_flavor`: flavor on openstack to use for virtual machines. + - `openstack_network`: Network on openstack to use for virtual machines. + - `openstack_image`: Openstack image to use for virtual machines. -**Returns:** - Path to the GitHub entity. --- -## class `ImmutableConfigChangedError` -Represents an error when changing immutable charm state. + - - -### function `__init__` +### classmethod `from_charm` ```python -__init__(msg: str) +from_charm(charm: CharmBase) → OpenstackRunnerConfig ``` -Initialize a new instance of the ImmutableConfigChangedError exception. +Initialize the config from charm. **Args:** - - `msg`: Explanation of the error. - - - - - ---- - -## class `InstanceType` -Type of instance for runner. + - `charm`: The charm instance. -**Attributes:** +**Raises:** - - `LOCAL_LXD`: LXD instance on the local juju machine. - - `OPENSTACK`: OpenStack instance on a cloud. + - `CharmConfigInvalidError`: Error with charm configuration virtual-machines not of int type. +**Returns:** + Openstack runner config of the charm. --- + + ## class `LocalLxdRunnerConfig` Runner configurations for local LXD instances. @@ -447,7 +498,7 @@ Runner configurations for local LXD instances. --- - + ### classmethod `check_virtual_machine_resources` @@ -478,7 +529,7 @@ Validate the virtual_machine_resources field values. --- - + ### classmethod `check_virtual_machines` @@ -507,7 +558,7 @@ Validate the virtual machines configuration value. --- - + ### classmethod `from_charm` @@ -537,73 +588,71 @@ Initialize the config from charm. --- -## class `OpenstackImage` -OpenstackImage from image builder relation data. 
+ + +## class `ProxyConfig` +Proxy configuration. **Attributes:** - - `id`: The OpenStack image ID. - - `tags`: Image tags, e.g. jammy + - `aproxy_address`: The address of aproxy snap instance if use_aproxy is enabled. + - `http`: HTTP proxy address. + - `https`: HTTPS proxy address. + - `no_proxy`: Comma-separated list of hosts that should not be proxied. + - `use_aproxy`: Whether aproxy should be used for the runners. +--- + +#### property aproxy_address + +Return the aproxy address. + --- - + -### classmethod `from_charm` +### classmethod `check_use_aproxy` ```python -from_charm(charm: CharmBase) → OpenstackImage | None +check_use_aproxy(use_aproxy: bool, values: dict) → bool ``` -Initialize the OpenstackImage info from relation data. - -None represents relation not established. None values for id/tags represent image not yet ready but the relation exists. +Validate the proxy configuration. **Args:** - - `charm`: The charm instance. - - - -**Returns:** - OpenstackImage metadata from charm relation data. - - ---- - -## class `OpenstackRunnerConfig` -Runner configuration for OpenStack Instances. + - `use_aproxy`: Value of use_aproxy variable. + - `values`: Values in the pydantic model. -**Attributes:** +**Raises:** - - `virtual_machines`: Number of virtual machine-based runner to spawn. - - `openstack_flavor`: flavor on openstack to use for virtual machines. - - `openstack_network`: Network on openstack to use for virtual machines. - - `openstack_image`: Openstack image to use for virtual machines. + - `ValueError`: if use_aproxy was set but no http/https was passed. +**Returns:** + Validated use_aproxy value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → OpenstackRunnerConfig +from_charm(charm: CharmBase) → ProxyConfig ``` -Initialize the config from charm. +Initialize the proxy config from charm. @@ -613,81 +662,73 @@ Initialize the config from charm. 
-**Raises:** - - - `CharmConfigInvalidError`: Error with charm configuration virtual-machines not of int type. - - - **Returns:** - Openstack runner config of the charm. + Current proxy config of the charm. --- -## class `ProxyConfig` -Proxy configuration. + + +## class `UnsupportedArchitectureError` +Raised when given machine charm architecture is unsupported. **Attributes:** - - `aproxy_address`: The address of aproxy snap instance if use_aproxy is enabled. - - `http`: HTTP proxy address. - - `https`: HTTPS proxy address. - - `no_proxy`: Comma-separated list of hosts that should not be proxied. - - `use_aproxy`: Whether aproxy should be used for the runners. + - `arch`: The current machine architecture. + ---- +### method `__init__` -#### property aproxy_address +```python +__init__(arch: str) → None +``` -Return the aproxy address. +Initialize a new instance of the UnsupportedArchitectureError exception. ---- +**Args:** + + - `arch`: The current machine architecture. - -### classmethod `check_use_aproxy` -```python -check_use_aproxy(use_aproxy: bool, values: dict) → bool -``` -Validate the proxy configuration. +--- + -**Args:** - - - `use_aproxy`: Value of use_aproxy variable. - - `values`: Values in the pydantic model. +## class `SSHDebugConnection` +SSH connection information for debug workflow. -**Raises:** +**Attributes:** - - `ValueError`: if use_aproxy was set but no http/https was passed. + - `host`: The SSH relay server host IP address inside the VPN. + - `port`: The SSH relay server port. + - `rsa_fingerprint`: The host SSH server public RSA key fingerprint. + - `ed25519_fingerprint`: The host SSH server public ed25519 key fingerprint. -**Returns:** - Validated use_aproxy value. --- - + ### classmethod `from_charm` ```python -from_charm(charm: CharmBase) → ProxyConfig +from_charm(charm: CharmBase) → list['SSHDebugConnection'] ``` -Initialize the proxy config from charm. +Initialize the SSHDebugInfo from charm relation data. 
@@ -698,65 +739,77 @@ Initialize the proxy config from charm. **Returns:** - Current proxy config of the charm. + List of connection information for ssh debug access. --- -## class `RepoPolicyComplianceConfig` -Configuration for the repo policy compliance service. + + +## class `ReactiveConfig` +Represents the configuration for reactive scheduling. **Attributes:** - - `token`: Token for the repo policy compliance service. - - `url`: URL of the repo policy compliance service. + - `mq_uri`: The URI of the MQ to use to spawn runners reactively. --- - + -### classmethod `from_charm` +### classmethod `from_database` ```python -from_charm(charm: CharmBase) → RepoPolicyComplianceConfig +from_database(database: DatabaseRequires) → ReactiveConfig | None ``` -Initialize the config from charm. +Initialize the ReactiveConfig from charm config and integration data. **Args:** - - `charm`: The charm instance. + - `database`: The database to fetch integration data from. + + + +**Returns:** + The connection information for the reactive MQ or None if not available. **Raises:** - - `CharmConfigInvalidError`: If an invalid configuration was set. + - `MissingMongoDBError`: If the information on how to access MongoDB is missing in the integration data. +--- -**Returns:** - Current repo-policy-compliance config. + +## class `ImmutableConfigChangedError` +Represents an error when changing immutable charm state. ---- + -## class `RunnerStorage` -Supported storage as runner disk. +### method `__init__` +```python +__init__(msg: str) +``` +Initialize a new instance of the ImmutableConfigChangedError exception. -**Attributes:** + + +**Args:** - - `JUJU_STORAGE`: Represents runner storage from Juju storage. - - `MEMORY`: Represents tempfs storage (ramdisk). + - `msg`: Explanation of the error. @@ -764,90 +817,76 @@ Supported storage as runner disk. --- -## class `SSHDebugConnection` -SSH connection information for debug workflow. + + +## class `CharmState` +The charm state. 
**Attributes:** - - `host`: The SSH relay server host IP address inside the VPN. - - `port`: The SSH relay server port. - - `rsa_fingerprint`: The host SSH server public RSA key fingerprint. - - `ed25519_fingerprint`: The host SSH server public ed25519 key fingerprint. - - - - ---- + - `arch`: The underlying compute architecture, i.e. x86_64, amd64, arm64/aarch64. + - `charm_config`: Configuration of the juju charm. + - `is_metrics_logging_available`: Whether the charm is able to issue metrics. + - `proxy_config`: Proxy-related configuration. + - `instance_type`: The type of instances, e.g., local lxd, openstack. + - `reactive_config`: The charm configuration related to reactive spawning mode. + - `runner_config`: The charm configuration related to runner VM configuration. + - `ssh_debug_connections`: SSH debug connections configuration information. - + -### classmethod `from_charm` +### method `__init__` ```python -from_charm(charm: CharmBase) → list['SSHDebugConnection'] +__init__( + arch: Arch, + is_metrics_logging_available: bool, + proxy_config: ProxyConfig, + instance_type: InstanceType, + charm_config: CharmConfig, + runner_config: OpenstackRunnerConfig | LocalLxdRunnerConfig, + reactive_config: ReactiveConfig | None, + ssh_debug_connections: list[SSHDebugConnection] +) → None ``` -Initialize the SSHDebugInfo from charm relation data. -**Args:** - - - `charm`: The charm instance. - -**Returns:** - List of connection information for ssh debug access. --- -## class `UnsupportedArchitectureError` -Raised when given machine charm architecture is unsupported. - + - -**Attributes:** - - - `arch`: The current machine architecture. - - - -### function `__init__` +### classmethod `from_charm` ```python -__init__(arch: str) → None +from_charm(charm: CharmBase, database: DatabaseRequires) → CharmState ``` -Initialize a new instance of the CharmConfigInvalidError exception. +Initialize the state from charm. **Args:** - - `arch`: The current machine architecture. 
- - - - - ---- - -## class `VirtualMachineResources` -Virtual machine resource configuration. + - `charm`: The charm instance. + - `database`: The database instance. -**Attributes:** +**Raises:** - - `cpu`: Number of vCPU for the virtual machine. - - `memory`: Amount of memory for the virtual machine. - - `disk`: Amount of disk for the virtual machine. + - `CharmConfigInvalidError`: If an invalid configuration was set. +**Returns:** + Current state of the charm. diff --git a/src-docs/errors.md b/src-docs/errors.md index cf7cde565..c61dd8410 100644 --- a/src-docs/errors.md +++ b/src-docs/errors.md @@ -7,39 +7,6 @@ Errors used by the charm. ---- - - - -## class `RunnerError` -Generic runner error as base exception. - - - - - ---- - - - -## class `RunnerExecutionError` -Error for executing commands on runner. - - - - - ---- - - - -## class `RunnerFileLoadError` -Error for loading file on runner. - - - - - --- @@ -55,8 +22,8 @@ Error for runner creation failure. -## class `RunnerRemoveError` -Error for runner removal failure. +## class `RunnerFileLoadError` +Error for loading file on runner. @@ -66,8 +33,8 @@ Error for runner removal failure. -## class `RunnerStartError` -Error for runner start failure. +## class `RunnerRemoveError` +Error for runner removal failure. @@ -99,6 +66,17 @@ Error for setting up aproxy. +## class `MissingServerConfigError` +Error for unable to create runner due to missing server configurations. + + + + + +--- + + + ## class `MissingRunnerBinaryError` Error for missing runner binary. @@ -108,7 +86,7 @@ Error for missing runner binary. --- - + ## class `ConfigurationError` Error for juju configuration. @@ -119,7 +97,7 @@ Error for juju configuration. --- - + ## class `MissingMongoDBError` Error for missing integration data. @@ -130,7 +108,7 @@ Error for missing integration data. --- - + ## class `LxdError` Error for executing LXD actions. @@ -141,7 +119,7 @@ Error for executing LXD actions. 
--- - + ## class `SubprocessError` Error for Subprocess calls. @@ -155,7 +133,7 @@ Error for Subprocess calls. - `stdout`: Content of stdout of the subprocess. - `stderr`: Content of stderr of the subprocess. - + ### method `__init__` @@ -185,7 +163,7 @@ Construct the subprocess error. --- - + ## class `IssueMetricEventError` Represents an error when issuing a metric event. @@ -196,7 +174,7 @@ Represents an error when issuing a metric event. --- - + ## class `LogrotateSetupError` Represents an error raised when logrotate cannot be setup. @@ -205,17 +183,6 @@ Represents an error raised when logrotate cannot be setup. ---- - - - -## class `MetricsStorageError` -Base class for all metrics storage errors. - - - - - --- @@ -231,50 +198,6 @@ Base class for all shared filesystem errors. -## class `CreateMetricsStorageError` -Represents an error when the metrics storage could not be created. - - - - - ---- - - - -## class `DeleteMetricsStorageError` -Represents an error when the metrics storage could not be deleted. - - - - - ---- - - - -## class `GetMetricsStorageError` -Represents an error when the metrics storage could not be retrieved. - - - - - ---- - - - -## class `QuarantineMetricsStorageError` -Represents an error when the metrics storage could not be quarantined. - - - - - ---- - - - ## class `SharedFilesystemMountError` Represents an error related to the mounting of the shared filesystem. @@ -284,84 +207,7 @@ Represents an error related to the mounting of the shared filesystem. --- - - -## class `RunnerMetricsError` -Base class for all runner metrics errors. - - - - - ---- - - - -## class `CorruptMetricDataError` -Represents an error with the data being corrupt. - - - - - ---- - - - -## class `GithubMetricsError` -Base class for all github metrics errors. - - - - - ---- - - - -## class `GithubClientError` -Base class for all github client errors. - - - - - ---- - - - -## class `GithubApiError` -Represents an error when the GitHub API returns an error. 
- - - - - ---- - - - -## class `TokenError` -Represents an error when the token is invalid or has not enough permissions. - - - - - ---- - - - -## class `JobNotFoundError` -Represents an error when the job could not be found on GitHub. - - - - - ---- - - + ## class `RunnerLogsError` Base class for all runner logs errors. @@ -370,58 +216,3 @@ Base class for all runner logs errors. ---- - - - -## class `OpenStackError` -Base class for OpenStack errors. - - - - - ---- - - - -## class `OpenStackInvalidConfigError` -Represents an invalid OpenStack configuration. - - - - - ---- - - - -## class `OpenStackUnauthorizedError` -Represents an unauthorized connection to OpenStack. - - - - - ---- - - - -## class `SSHError` -Represents an error while interacting with SSH. - - - - - ---- - - - -## class `KeyfileError` -Represents missing keyfile for SSH. - - - - - diff --git a/src-docs/event_timer.md b/src-docs/event_timer.md index 57055e575..7d10d807c 100644 --- a/src-docs/event_timer.md +++ b/src-docs/event_timer.md @@ -109,7 +109,7 @@ Construct the timer manager. --- - + ### method `disable_event_timer` @@ -138,11 +138,7 @@ Disable the systemd timer for the given event. ### method `ensure_event_timer` ```python -ensure_event_timer( - event_name: str, - interval: int, - timeout: Optional[int] = None -) → None +ensure_event_timer(event_name: str, interval: int, timeout: int) → None ``` Ensure that a systemd service and timer are registered to dispatch the given event. diff --git a/src-docs/github_client.md b/src-docs/github_client.md index 6cd298c52..679c9f907 100644 --- a/src-docs/github_client.md +++ b/src-docs/github_client.md @@ -8,122 +8,26 @@ GitHub API client. Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner. ---- - - - -## function `catch_http_errors` - -```python -catch_http_errors( - func: Callable[~ParamT, ~ReturnT] -) → Callable[~ParamT, ~ReturnT] -``` - -Catch HTTP errors and raise custom exceptions. 
- - - -**Args:** - - - `func`: The target function to catch common errors for. - - - -**Returns:** - The decorated function. - --- - + ## class `GithubClient` GitHub API client. - - -### method `__init__` - -```python -__init__(token: str) -``` - -Instantiate the GiHub API client. - - - -**Args:** - - - `token`: GitHub personal token for API requests. - --- - - -### method `delete_runner` - -```python -delete_runner(path: GithubOrg | GithubRepo, runner_id: int) → None -``` - -Delete the self-hosted runner from GitHub. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - `runner_id`: Id of the runner. - ---- - - - -### method `get_job_info` - -```python -get_job_info( - path: GithubRepo, - workflow_run_id: str, - runner_name: str -) → JobStats -``` - -Get information about a job for a specific workflow run. - - - -**Args:** - - - `path`: GitHub repository path in the format '/'. - - `workflow_run_id`: Id of the workflow run. - - `runner_name`: Name of the runner. - - - -**Raises:** - - - `TokenError`: if there was an error with the Github token crdential provided. - - `JobNotFoundError`: If no jobs were found. - - - -**Returns:** - Job information. - ---- - - + ### method `get_runner_application` ```python get_runner_application( - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, arch: Arch, os: str = 'linux' ) → RunnerApplication @@ -150,73 +54,4 @@ Get runner application available for download for given arch. **Returns:** The runner application. ---- - - - -### method `get_runner_github_info` - -```python -get_runner_github_info(path: GithubOrg | GithubRepo) → list[SelfHostedRunner] -``` - -Get runner information on GitHub under a repo or org. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - - -**Returns:** - List of runner information. 
- ---- - - - -### method `get_runner_registration_token` - -```python -get_runner_registration_token(path: GithubOrg | GithubRepo) → str -``` - -Get token from GitHub used for registering runners. - - - -**Args:** - - - `path`: GitHub repository path in the format '/', or the GitHub organization name. - - - -**Returns:** - The registration token. - ---- - - - -### method `get_runner_remove_token` - -```python -get_runner_remove_token(path: GithubOrg | GithubRepo) → str -``` - -Get token from GitHub used for removing runners. - - - -**Args:** - - - `path`: The Github org/repo path. - - - -**Returns:** - The removing token. - diff --git a/src-docs/openstack_cloud.md b/src-docs/openstack_cloud.md index 4d82f5359..34aa3f26f 100644 --- a/src-docs/openstack_cloud.md +++ b/src-docs/openstack_cloud.md @@ -7,7 +7,10 @@ Module for managing Openstack cloud. **Global Variables** --------------- -- **openstack_manager**: # Copyright 2024 Canonical Ltd. +- **openstack_cloud**: # Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. + +- **openstack_runner_manager**: # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. diff --git a/src-docs/openstack_cloud.openstack_manager.md b/src-docs/openstack_cloud.openstack_manager.md index a0f0a2531..115eec05b 100644 --- a/src-docs/openstack_cloud.openstack_manager.md +++ b/src-docs/openstack_cloud.openstack_manager.md @@ -27,7 +27,7 @@ create_instance_config( app_name: str, unit_num: int, image_id: str, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, labels: Iterable[str], registration_token: str ) → InstanceConfig @@ -75,7 +75,7 @@ The configuration values for creating a single runner instance. 
```python __init__( - github_path: GithubOrg | GithubRepo, + github_path: GitHubOrg | GitHubRepo, image_id: str, labels: Iterable[str], name: str, diff --git a/src-docs/openstack_cloud.openstack_runner_manager.md b/src-docs/openstack_cloud.openstack_runner_manager.md index 752e2f9d3..64e7ce91d 100644 --- a/src-docs/openstack_cloud.openstack_runner_manager.md +++ b/src-docs/openstack_cloud.openstack_runner_manager.md @@ -17,7 +17,7 @@ Manager for self-hosted runner on OpenStack. --- - + ## class `OpenStackCloudConfig` Configuration for OpenStack cloud authorisation information. @@ -47,7 +47,7 @@ __init__(clouds_config: dict[str, dict], cloud: str) → None --- - + ## class `OpenStackServerConfig` Configuration for OpenStack server. @@ -78,9 +78,9 @@ __init__(image: str, flavor: str, network: str) → None --- - + -## class `OpenstackRunnerManager` +## class `OpenStackRunnerManager` Manage self-hosted runner on OpenStack cloud. @@ -89,15 +89,16 @@ Manage self-hosted runner on OpenStack cloud. - `name_prefix`: The name prefix of the runners created. - + ### method `__init__` ```python __init__( + manager_name: str, prefix: str, cloud_config: OpenStackCloudConfig, - server_config: OpenStackServerConfig, + server_config: OpenStackServerConfig | None, runner_config: GitHubRunnerConfig, service_config: SupportServiceConfig ) → None @@ -109,9 +110,10 @@ Construct the object. **Args:** + - `manager_name`: A name to identify this manager. - `prefix`: The prefix to runner name. - `cloud_config`: The configuration for OpenStack authorisation. - - `server_config`: The configuration for creating OpenStack server. + - `server_config`: The configuration for creating OpenStack server. Unable to create runner if None. - `runner_config`: The configuration for the runner. - `service_config`: The configuration of supporting services of the runners. @@ -131,7 +133,7 @@ The prefix of runner names. --- - + ### method `cleanup` @@ -154,7 +156,7 @@ Cleanup runner and resource on the cloud. 
--- - + ### method `create_runner` @@ -174,6 +176,7 @@ Create a self-hosted runner. **Raises:** + - `MissingServerConfigError`: Unable to create runner due to missing configuration. - `RunnerCreateError`: Unable to create runner due to OpenStack issues. @@ -183,7 +186,7 @@ Create a self-hosted runner. --- - + ### method `delete_runner` @@ -207,7 +210,7 @@ Delete self-hosted runners. --- - + ### method `flush_runners` @@ -230,7 +233,7 @@ Remove idle and/or busy runners. --- - + ### method `get_runner` @@ -253,7 +256,7 @@ Get a self-hosted runner by instance id. --- - + ### method `get_runners` diff --git a/src-docs/runner.md b/src-docs/runner.md index b513ad697..d7bfb93c1 100644 --- a/src-docs/runner.md +++ b/src-docs/runner.md @@ -17,7 +17,7 @@ The `RunnerManager` class from `runner_manager.py` creates and manages a collect --- - + ## class `Snap` This class represents a snap installation. @@ -36,7 +36,7 @@ This class represents a snap installation. --- - + ## class `WgetExecutable` The executable to be installed through wget. @@ -66,7 +66,7 @@ __init__(url: str, cmd: str) → None --- - + ## class `CreateRunnerConfig` The configuration values for creating a single runner instance. @@ -105,7 +105,7 @@ __init__( --- - + ## class `Runner` Single instance of GitHub self-hosted runner. @@ -120,7 +120,7 @@ Single instance of GitHub self-hosted runner. - `runner_script`: The runner start script file path. - `pre_job_script`: The runner pre_job script file path. This is referenced in the env_file in the ACTIONS_RUNNER_HOOK_JOB_STARTED environment variable. - + ### method `__init__` @@ -149,7 +149,7 @@ Construct the runner instance. --- - + ### method `create` @@ -173,7 +173,7 @@ Create the runner instance on LXD and register it on GitHub. --- - + ### method `pull_logs` @@ -193,7 +193,7 @@ Expects the runner to have an instance. 
--- - + ### method `remove` diff --git a/src-docs/runner_manager.md b/src-docs/runner_manager.md index 8d1773bf0..883745753 100644 --- a/src-docs/runner_manager.md +++ b/src-docs/runner_manager.md @@ -13,9 +13,9 @@ Runner Manager manages the runners on LXD and GitHub. --- - + -## class `RunnerManager` +## class `LXDRunnerManager` Manage a group of runners according to configuration. @@ -25,7 +25,7 @@ Manage a group of runners according to configuration. - `runner_bin_path`: The github runner app scripts path. - `cron_path`: The path to runner build image cron job. - + ### method `__init__` @@ -33,7 +33,7 @@ Manage a group of runners according to configuration. __init__( app_name: str, unit: int, - runner_manager_config: RunnerManagerConfig + runner_manager_config: LXDRunnerManagerConfig ) → None ``` @@ -52,7 +52,7 @@ Construct RunnerManager object for creating and managing runners. --- - + ### method `build_runner_image` @@ -72,7 +72,7 @@ Build container image in test mode, else virtual machine image. --- - + ### method `check_runner_bin` @@ -89,12 +89,12 @@ Check if runner binary exists. --- - + ### method `flush` ```python -flush(mode: FlushMode = ) → int +flush(mode: LXDFlushMode = ) → int ``` Remove existing runners. @@ -118,7 +118,7 @@ Remove existing runners. --- - + ### method `get_github_info` @@ -135,7 +135,7 @@ Get information on the runners from GitHub. --- - + ### method `get_latest_runner_bin_url` @@ -166,7 +166,7 @@ The runner binary URL changes when a new version is available. --- - + ### method `has_runner_image` @@ -183,7 +183,7 @@ Check if the runner image exists. --- - + ### method `reconcile` @@ -207,7 +207,7 @@ Bring runners in line with target. --- - + ### method `schedule_build_runner_image` @@ -219,7 +219,7 @@ Install cron job for building runner image. 
--- - + ### method `update_runner_bin` diff --git a/src-docs/runner_manager_type.md b/src-docs/runner_manager_type.md index f6dd4faae..cd7eaf5d2 100644 --- a/src-docs/runner_manager_type.md +++ b/src-docs/runner_manager_type.md @@ -11,7 +11,7 @@ Types used by RunnerManager class. -## class `FlushMode` +## class `LXDFlushMode` Strategy for flushing runners. During pre-job (repo-check), the runners are marked as idle and if the pre-job fails, the runner falls back to being idle again. Hence wait_repo_check is required. @@ -71,7 +71,7 @@ __init__( -## class `RunnerManagerConfig` +## class `LXDRunnerManagerConfig` Configuration of runner manager. @@ -97,7 +97,7 @@ __init__( charm_state: CharmState, image: str, lxd_storage_path: Path, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, service_token: str, token: str, dockerhub_mirror: str | None = None, @@ -147,7 +147,7 @@ Configuration of runner manager. ```python __init__( charm_state: CharmState, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, labels: Iterable[str], token: str, flavor: str, diff --git a/src-docs/runner_type.md b/src-docs/runner_type.md index 8c9db658a..cde5b2a7e 100644 --- a/src-docs/runner_type.md +++ b/src-docs/runner_type.md @@ -9,7 +9,7 @@ Types used by Runner class. --- - + ## class `RunnerNameByHealth` Set of runners instance by health state. @@ -39,7 +39,7 @@ __init__(healthy: tuple[str, ], unhealthy: tuple[str, ]) → None --- - + ## class `ProxySetting` Represent HTTP-related proxy settings. @@ -76,7 +76,7 @@ __init__( --- - + ## class `RunnerConfig` Configuration for runner. @@ -106,7 +106,7 @@ __init__( labels: tuple[str], lxd_storage_path: Path, name: str, - path: GithubOrg | GithubRepo, + path: GitHubOrg | GitHubRepo, proxies: ProxySetting, dockerhub_mirror: str | None = None, ssh_debug_connections: list[SSHDebugConnection] | None = None @@ -123,7 +123,7 @@ __init__( --- - + ## class `RunnerStatus` Status of runner. 
@@ -160,7 +160,7 @@ __init__( --- - + ## class `RunnerGithubInfo` GitHub info of a runner. diff --git a/src-docs/shared_fs.md b/src-docs/shared_fs.md index 004556f9f..5ae59a8ca 100644 --- a/src-docs/shared_fs.md +++ b/src-docs/shared_fs.md @@ -13,7 +13,7 @@ Classes and functions to operate on the shared filesystem between the charm and --- - + ## function `create` @@ -45,7 +45,7 @@ The method is not idempotent and will raise an exception if the shared filesyste --- - + ## function `list_all` @@ -63,7 +63,7 @@ List all the metric storages. --- - + ## function `get` @@ -95,7 +95,7 @@ Mounts the filesystem if it is not currently mounted. --- - + ## function `delete` diff --git a/src-docs/utilities.md b/src-docs/utilities.md index 6c2aab4e1..b2c4cbf21 100644 --- a/src-docs/utilities.md +++ b/src-docs/utilities.md @@ -8,77 +8,7 @@ Utilities used by the charm. --- - - -## function `retry` - -```python -retry( - exception: Type[Exception] = , - tries: int = 1, - delay: float = 0, - max_delay: Optional[float] = None, - backoff: float = 1, - local_logger: Logger = -) → Callable[[Callable[~ParamT, ~ReturnT]], Callable[~ParamT, ~ReturnT]] -``` - -Parameterize the decorator for adding retry to functions. - - - -**Args:** - - - `exception`: Exception type to be retried. - - `tries`: Number of attempts at retry. - - `delay`: Time in seconds to wait between retry. - - `max_delay`: Max time in seconds to wait between retry. - - `backoff`: Factor to increase the delay by each retry. - - `local_logger`: Logger for logging. - - - -**Returns:** - The function decorator for retry. - - ---- - - - -## function `secure_run_subprocess` - -```python -secure_run_subprocess( - cmd: Sequence[str], - hide_cmd: bool = False, - **kwargs: dict[str, Any] -) → CompletedProcess[bytes] -``` - -Run command in subprocess according to security recommendations. - -CalledProcessError will not be raised on error of the command executed. Errors should be handled by the caller by checking the exit code. 
- -The command is executed with `subprocess.run`, additional arguments can be passed to it as keyword arguments. The following arguments to `subprocess.run` should not be set: `capture_output`, `shell`, `check`. As those arguments are used by this function. - - - -**Args:** - - - `cmd`: Command in a list. - - `hide_cmd`: Hide logging of cmd. - - `kwargs`: Additional keyword arguments for the `subprocess.run` call. - - - -**Returns:** - Object representing the completed process. The outputs subprocess can accessed. - - ---- - - + ## function `execute_command` @@ -118,7 +48,7 @@ The output is logged if the log level of the logger is set to debug. --- - + ## function `get_env_var` @@ -144,29 +74,7 @@ Looks for all upper-case and all low-case of the `env_var`. --- - - -## function `set_env_var` - -```python -set_env_var(env_var: str, value: str) → None -``` - -Set the environment variable value. - -Set the all upper case and all low case of the `env_var`. - - - -**Args:** - - - `env_var`: Name of the environment variable. - - `value`: Value to set environment variable to. - - ---- - - + ## function `bytes_with_unit_to_kib` @@ -196,7 +104,7 @@ Convert a positive integer followed by a unit to number of kibibytes. 
--- - + ## function `remove_residual_venv_dirs` diff --git a/src/charm.py b/src/charm.py index a61586b9b..9dae00059 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,6 +7,22 @@ # pylint: disable=too-many-lines """Charm for creating and managing GitHub self-hosted runner instances.""" +from github_runner_manager.manager.cloud_runner_manager import ( + GitHubRunnerConfig, + SupportServiceConfig, +) +from github_runner_manager.manager.runner_manager import ( + FlushMode, + RunnerManager, + RunnerManagerConfig, +) +from github_runner_manager.manager.runner_scaler import RunnerScaler +from github_runner_manager.openstack_cloud.openstack_runner_manager import ( + OpenStackCloudConfig, + OpenStackRunnerManager, + OpenStackServerConfig, +) +from github_runner_manager.types_.github import GitHubPath, GitHubRunnerStatus, parse_github_path from utilities import bytes_with_unit_to_kib, execute_command, remove_residual_venv_dirs, retry @@ -56,20 +72,17 @@ TOKEN_CONFIG_NAME, CharmConfigInvalidError, CharmState, - GithubPath, InstanceType, OpenstackImage, ProxyConfig, RunnerStorage, VirtualMachineResources, - parse_github_path, ) from errors import ( ConfigurationError, LogrotateSetupError, MissingMongoDBError, MissingRunnerBinaryError, - OpenStackUnauthorizedError, RunnerBinaryError, RunnerError, SubprocessError, @@ -77,12 +90,14 @@ ) from event_timer import EventTimer, TimerStatusError from firewall import Firewall, FirewallEntry -from github_type import GitHubRunnerStatus -from openstack_cloud.openstack_manager import OpenstackRunnerManager from runner import LXD_PROFILE_YAML -from runner_manager import RunnerManager, RunnerManagerConfig -from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig +from runner_manager import LXDRunnerManager, LXDRunnerManagerConfig +from runner_manager_type import LXDFlushMode +# We assume a stuck reconcile event when it takes longer +# than 10 times a normal interval. 
Currently, we are only aware of +# https://bugs.launchpad.net/juju/+bug/2055184 causing a stuck reconcile event. +RECONCILIATION_INTERVAL_TIMEOUT_FACTOR = 10 RECONCILE_RUNNERS_EVENT = "reconcile-runners" # This is currently hardcoded and may be moved to a config option in the future. @@ -133,11 +148,6 @@ def func_with_catch_errors(self: "GithubRunnerCharm", event: EventT) -> None: "GitHub runner application not downloaded; the charm will retry download on " "reconcile interval" ) - except OpenStackUnauthorizedError: - logger.exception("Unauthorized OpenStack connection") - self.unit.status = BlockedStatus( - "Unauthorized OpenStack connection. Check credentials." - ) except MissingMongoDBError as err: logger.exception("Missing integration data") self.unit.status = WaitingStatus(str(err)) @@ -246,6 +256,10 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self.on[DEBUG_SSH_INTEGRATION_NAME].relation_changed, self._on_debug_ssh_relation_changed, ) + self.framework.observe( + self.on[IMAGE_INTEGRATION_NAME].relation_joined, + self._on_image_relation_joined, + ) self.framework.observe( self.on[IMAGE_INTEGRATION_NAME].relation_changed, self._on_image_relation_changed, @@ -365,8 +379,8 @@ def _ensure_service_health(self) -> None: raise def _get_runner_manager( - self, state: CharmState, token: str | None = None, path: GithubPath | None = None - ) -> RunnerManager: + self, state: CharmState, token: str | None = None, path: GitHubPath | None = None + ) -> LXDRunnerManager: """Get a RunnerManager instance. 
Args: @@ -399,10 +413,10 @@ def _get_runner_manager( app_name, unit = self.unit.name.rsplit("/", 1) - return RunnerManager( + return LXDRunnerManager( app_name, unit, - RunnerManagerConfig( + LXDRunnerManagerConfig( charm_state=state, dockerhub_mirror=state.charm_config.dockerhub_mirror, image=state.runner_config.base_image.value, @@ -493,10 +507,11 @@ def _on_start(self, _: StartEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: + self.unit.status = MaintenanceStatus("Starting runners") if not self._get_set_image_ready_status(): return - openstack_runner_manager = self._get_openstack_runner_manager(state) - openstack_runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -508,7 +523,7 @@ def _on_start(self, _: StartEvent) -> None: self.unit.status = MaintenanceStatus("Starting runners") try: - runner_manager.flush(FlushMode.FLUSH_IDLE) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, @@ -544,7 +559,8 @@ def _set_reconcile_timer(self) -> None: self._event_timer.ensure_event_timer( event_name="reconcile-runners", interval=int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]), - timeout=int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]) - 1, + timeout=RECONCILIATION_INTERVAL_TIMEOUT_FACTOR + * int(self.config[RECONCILE_INTERVAL_CONFIG_NAME]), ) def _ensure_reconcile_timer_is_active(self) -> None: @@ -574,7 +590,7 @@ def _on_upgrade_charm(self, _: UpgradeCharmEvent) -> None: runner_manager = self._get_runner_manager(state) logger.info("Flushing the runners...") - runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, @@ -610,7 +626,7 @@ def 
_on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 if prev_runner_manager: self.unit.status = MaintenanceStatus("Removing runners due to config change") # Flush runner in case the prev token has expired. - prev_runner_manager.flush(FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) + prev_runner_manager.flush(LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) state = self._setup_state() @@ -618,9 +634,9 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 if not self._get_set_image_ready_status(): return if state.charm_config.token != self._stored.token: - openstack_runner_manager = self._get_openstack_runner_manager(state) - openstack_runner_manager.flush() - openstack_runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + runner_scaler.reconcile(state.runner_config.virtual_machines) # TODO: 2024-04-12: Flush on token changes. self.unit.status = ActiveStatus() return @@ -629,7 +645,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 runner_manager = self._get_runner_manager(state) if state.charm_config.token != self._stored.token: - runner_manager.flush(FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK) self._stored.token = state.charm_config.token self._reconcile_runners( runner_manager, @@ -639,7 +655,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None: # noqa: C901 self.unit.status = ActiveStatus() def _check_and_update_local_lxd_dependencies( - self, runner_manager: RunnerManager, token: str, proxy_config: ProxyConfig + self, runner_manager: LXDRunnerManager, token: str, proxy_config: ProxyConfig ) -> bool: """Check and update runner binary and services for local LXD runners. 
@@ -690,7 +706,7 @@ def _check_and_update_local_lxd_dependencies( runner_bin_updated, ) self.unit.status = MaintenanceStatus("Flushing runners due to updated deps") - runner_manager.flush(FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE_WAIT_REPO_CHECK) self._start_services(token, proxy_config) self.unit.status = ActiveStatus() @@ -719,8 +735,8 @@ def _trigger_reconciliation(self) -> None: if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -757,21 +773,16 @@ def _on_check_runners_action(self, event: ActionEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: - openstack_runner_manager = self._get_openstack_runner_manager(state) - runner_info = openstack_runner_manager.get_github_runner_info() - - for info in runner_info: - if info.online: - online += 1 - runner_names.append(info.runner_name) - else: - offline += 1 + runner_scaler = self._get_runner_scaler(state) + info = runner_scaler.get_runner_info() event.set_results( { - "online": online, - "offline": offline, - "unknown": unknown, - "runners": ", ".join(runner_names), + "online": info.online, + "busy": info.busy, + "offline": info.offline, + "unknown": info.unknown, + "runners": info.runners, + "busy-runners": info.busy_runners, } ) return @@ -814,9 +825,9 @@ def _on_reconcile_runners_action(self, event: ActionEvent) -> None: if not self._get_set_image_ready_status(): event.fail("Openstack image not yet provided/ready.") return - runner_manager = self._get_openstack_runner_manager(state) + runner_scaler = self._get_runner_scaler(state) - delta = runner_manager.reconcile(state.runner_config.virtual_machines) + 
delta = runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() event.set_results({"delta": {"virtual-machines": delta}}) return @@ -847,22 +858,22 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: # Flushing mode not implemented for OpenStack yet. - runner_manager = self._get_openstack_runner_manager(state) - flushed = runner_manager.flush() - event.set_results({"delta": {"virtual-machines": flushed}}) + runner_scaler = self._get_runner_scaler(state) + flushed = runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + logger.info("Flushed %s runners", flushed) + delta = runner_scaler.reconcile(state.runner_config.virtual_machines) + event.set_results({"delta": {"virtual-machines": delta}}) return runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK) delta = self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, state.runner_config.virtual_machine_resources, ) - - self._on_check_runners_action(event) - event.set_results(delta) + event.set_results({"delta": {"virtual-machines": delta}}) @catch_action_errors def _on_update_dependencies_action(self, event: ActionEvent) -> None: @@ -895,15 +906,15 @@ def _on_stop(self, _: StopEvent) -> None: state = self._setup_state() if state.instance_type == InstanceType.OPENSTACK: - runner_manager = self._get_openstack_runner_manager(state) - runner_manager.flush() + runner_scaler = self._get_runner_scaler(state) + runner_scaler.flush() return runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_BUSY) + runner_manager.flush(LXDFlushMode.FLUSH_BUSY) def _reconcile_runners( - self, runner_manager: RunnerManager, num: int, resources: VirtualMachineResources + self, runner_manager: LXDRunnerManager, num: int, resources: VirtualMachineResources ) -> 
Dict[str, Any]: """Reconcile the current runners state and intended runner state. @@ -918,7 +929,7 @@ def _reconcile_runners( Returns: Changes in runner number due to reconciling runners. """ - if not RunnerManager.runner_bin_path.is_file(): + if not LXDRunnerManager.runner_bin_path.is_file(): logger.warning("Unable to reconcile due to missing runner binary") raise MissingRunnerBinaryError("Runner binary not found.") @@ -1148,15 +1159,15 @@ def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None: if state.instance_type == InstanceType.OPENSTACK: if not self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) + runner_scaler = self._get_runner_scaler(state) # TODO: 2024-04-12: Should be flush idle. - runner_manager.flush() - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler.flush() + runner_scaler.reconcile(state.runner_config.virtual_machines) return self._refresh_firewall(state) runner_manager = self._get_runner_manager(state) - runner_manager.flush(FlushMode.FLUSH_IDLE) + runner_manager.flush(LXDFlushMode.FLUSH_IDLE) self._reconcile_runners( runner_manager, state.runner_config.virtual_machines, @@ -1178,12 +1189,13 @@ def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None: cloud = list(clouds_yaml["clouds"].keys())[0] auth_map = clouds_yaml["clouds"][cloud]["auth"] for relation in self.model.relations[IMAGE_INTEGRATION_NAME]: - relation.data[self.model.unit].update(auth_map) + relation.data[self.unit].update(auth_map) @catch_charm_errors def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None: """Handle image relation changed event.""" state = self._setup_state() + self.unit.status = MaintenanceStatus("Update image for runners") if state.instance_type != InstanceType.OPENSTACK: self.unit.status = BlockedStatus( @@ -1193,10 +1205,9 @@ def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None: if not 
self._get_set_image_ready_status(): return - runner_manager = self._get_openstack_runner_manager(state) - # TODO: 2024-04-12: Should be flush idle. - runner_manager.flush() - runner_manager.reconcile(state.runner_config.virtual_machines) + runner_scaler = self._get_runner_scaler(state) + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + runner_scaler.reconcile(state.runner_config.virtual_machines) self.unit.status = ActiveStatus() return @@ -1215,13 +1226,10 @@ def _get_set_image_ready_status(self) -> bool: return False return True - def _get_openstack_runner_manager( - self, state: CharmState, token: str | None = None, path: GithubPath | None = None - ) -> OpenstackRunnerManager: - """Get OpenstackRunnerManager instance. - - TODO: 2024-07-09 Combine this with `_get_runner_manager` during the runner manager \ - interface refactor. + def _get_runner_scaler( + self, state: CharmState, token: str | None = None, path: GitHubPath | None = None + ) -> RunnerScaler: + """Get runner scaler instance for scaling runners. Args: state: Charm state. @@ -1231,39 +1239,62 @@ def _get_openstack_runner_manager( name. If None the path in charm state is used. Returns: - An instance of OpenstackRunnerManager. + An instance of RunnerScaler. """ if token is None: token = state.charm_config.token if path is None: path = state.charm_config.path - # Empty image can be passed down due to a delete only case where deletion of runners do not - # depend on the image ID being available. Make sure that the charm goes to blocked status - # in hook where a runner may be created. TODO: 2024-07-09 This logic is subject to - # refactoring. + clouds = list(state.charm_config.openstack_clouds_yaml["clouds"].keys()) + if len(clouds) > 1: + logger.warning( + "Multiple clouds defined in clouds.yaml. Using the first one to connect." 
+ ) + cloud_config = OpenStackCloudConfig( + clouds_config=state.charm_config.openstack_clouds_yaml, + cloud=clouds[0], + ) + server_config = None + image_labels = [] image = state.runner_config.openstack_image - image_id = image.id if image and image.id else "" - image_labels = image.tags if image and image.tags else [] + if image and image.id: + server_config = OpenStackServerConfig( + image=image.id, + flavor=state.runner_config.openstack_flavor, + network=state.runner_config.openstack_network, + ) + if image.tags: + image_labels += image.tags - app_name, unit = self.unit.name.rsplit("/", 1) - openstack_runner_manager_config = OpenstackRunnerManagerConfig( - charm_state=state, - path=path, - token=token, - labels=(*state.charm_config.labels, *image_labels), - flavor=state.runner_config.openstack_flavor, - image=image_id, - network=state.runner_config.openstack_network, + runner_config = GitHubRunnerConfig( + github_path=path, labels=(*state.charm_config.labels, *image_labels) + ) + service_config = SupportServiceConfig( + proxy_config=state.proxy_config, dockerhub_mirror=state.charm_config.dockerhub_mirror, - reactive_config=state.reactive_config, + ssh_debug_connections=state.ssh_debug_connections, + repo_policy_compliance=state.charm_config.repo_policy_compliance, ) - return OpenstackRunnerManager( - app_name, - unit, - openstack_runner_manager_config, - state.charm_config.openstack_clouds_yaml, + # The prefix is set to f"{application_name}-{unit number}" + openstack_runner_manager = OpenStackRunnerManager( + manager_name=self.app.name, + prefix=self.unit.name.replace("/", "-"), + cloud_config=cloud_config, + server_config=server_config, + runner_config=runner_config, + service_config=service_config, + ) + runner_manager_config = RunnerManagerConfig( + token=token, + path=path, + ) + runner_manager = RunnerManager( + manager_name=self.app.name, + cloud_runner_manager=openstack_runner_manager, + config=runner_manager_config, ) + return 
RunnerScaler(runner_manager=runner_manager, reactive_config=state.reactive_config) if __name__ == "__main__": diff --git a/src/charm_state.py b/src/charm_state.py index 492f4b21e..6ae46d386 100644 --- a/src/charm_state.py +++ b/src/charm_state.py @@ -19,6 +19,9 @@ import yaml from charms.data_platform_libs.v0.data_interfaces import DatabaseRequires +from github_runner_manager import openstack_cloud +from github_runner_manager.errors import OpenStackInvalidConfigError +from github_runner_manager.types_.github import GitHubPath, parse_github_path from ops import CharmBase from pydantic import ( AnyHttpUrl, @@ -31,8 +34,7 @@ validator, ) -import openstack_cloud -from errors import MissingMongoDBError, OpenStackInvalidConfigError +from errors import MissingMongoDBError from firewall import FirewallEntry from utilities import get_env_var @@ -87,75 +89,6 @@ class AnyHttpsUrl(AnyHttpUrl): allowed_schemes = {"https"} -@dataclasses.dataclass -class GithubRepo: - """Represent GitHub repository. - - Attributes: - owner: Owner of the GitHub repository. - repo: Name of the GitHub repository. - """ - - owner: str - repo: str - - def path(self) -> str: - """Return a string representing the path. - - Returns: - Path to the GitHub entity. - """ - return f"{self.owner}/{self.repo}" - - -@dataclasses.dataclass -class GithubOrg: - """Represent GitHub organization. - - Attributes: - org: Name of the GitHub organization. - group: Runner group to spawn the runners in. - """ - - org: str - group: str - - def path(self) -> str: - """Return a string representing the path. - - Returns: - Path to the GitHub entity. - """ - return self.org - - -GithubPath = GithubOrg | GithubRepo - - -def parse_github_path(path_str: str, runner_group: str) -> GithubPath: - """Parse GitHub path. - - Args: - path_str: GitHub path in string format. - runner_group: Runner group name for GitHub organization. If the path is - a repository this argument is ignored. 
- - Raises: - CharmConfigInvalidError: if an invalid path string was given. - - Returns: - GithubPath object representing the GitHub repository, or the GitHub - organization with runner group information. - """ - if "/" in path_str: - paths = tuple(segment for segment in path_str.split("/") if segment) - if len(paths) != 2: - raise CharmConfigInvalidError(f"Invalid path configuration {path_str}") - owner, repo = paths - return GithubRepo(owner=owner, repo=repo) - return GithubOrg(org=path_str, group=runner_group) - - @dataclasses.dataclass class GithubConfig: """Charm configuration related to GitHub. @@ -166,7 +99,7 @@ class GithubConfig: """ token: str - path: GithubPath + path: GitHubPath @classmethod def from_charm(cls, charm: CharmBase) -> "GithubConfig": @@ -186,7 +119,10 @@ def from_charm(cls, charm: CharmBase) -> "GithubConfig": path_str = cast(str, charm.config.get(PATH_CONFIG_NAME, "")) if not path_str: raise CharmConfigInvalidError(f"Missing {PATH_CONFIG_NAME} configuration") - path = parse_github_path(cast(str, path_str), cast(str, runner_group)) + try: + path = parse_github_path(cast(str, path_str), cast(str, runner_group)) + except ValueError as e: + raise CharmConfigInvalidError(str(e)) from e token = cast(str, charm.config.get(TOKEN_CONFIG_NAME)) if not token: @@ -410,7 +346,7 @@ class CharmConfig(BaseModel): dockerhub_mirror: AnyHttpsUrl | None labels: tuple[str, ...] 
openstack_clouds_yaml: OpenStackCloudsYAML | None - path: GithubPath + path: GitHubPath reconcile_interval: int repo_policy_compliance: RepoPolicyComplianceConfig | None token: str diff --git a/src/errors.py b/src/errors.py index 59d28a239..7212b4642 100644 --- a/src/errors.py +++ b/src/errors.py @@ -6,31 +6,31 @@ from typing import Union +# we import the errors from the module, these are used in the charm +from github_runner_manager.errors import ( # noqa: F401 pylint: disable=unused-import + CreateMetricsStorageError, + DeleteMetricsStorageError, + GetMetricsStorageError, + GithubClientError, + GithubMetricsError, + MetricsStorageError, + RunnerError, + TokenError, +) -class RunnerError(Exception): - """Generic runner error as base exception.""" - -class RunnerExecutionError(RunnerError): - """Error for executing commands on runner.""" +class RunnerCreateError(RunnerError): + """Error for runner creation failure.""" class RunnerFileLoadError(RunnerError): """Error for loading file on runner.""" -class RunnerCreateError(RunnerError): - """Error for runner creation failure.""" - - class RunnerRemoveError(RunnerError): """Error for runner removal failure.""" -class RunnerStartError(RunnerError): - """Error for runner start failure.""" - - class RunnerBinaryError(RunnerError): """Error of getting runner binary.""" @@ -39,6 +39,10 @@ class RunnerAproxyError(RunnerError): """Error for setting up aproxy.""" +class MissingServerConfigError(RunnerError): + """Error for unable to create runner due to missing server configurations.""" + + class MissingRunnerBinaryError(Exception): """Error for missing runner binary.""" @@ -96,81 +100,13 @@ class LogrotateSetupError(Exception): """Represents an error raised when logrotate cannot be setup.""" -class MetricsStorageError(Exception): - """Base class for all metrics storage errors.""" - - class SharedFilesystemError(MetricsStorageError): """Base class for all shared filesystem errors.""" -class 
CreateMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be created.""" - - -class DeleteMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be deleted.""" - - -class GetMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be retrieved.""" - - -class QuarantineMetricsStorageError(MetricsStorageError): - """Represents an error when the metrics storage could not be quarantined.""" - - class SharedFilesystemMountError(SharedFilesystemError): """Represents an error related to the mounting of the shared filesystem.""" -class RunnerMetricsError(Exception): - """Base class for all runner metrics errors.""" - - -class CorruptMetricDataError(RunnerMetricsError): - """Represents an error with the data being corrupt.""" - - -class GithubMetricsError(Exception): - """Base class for all github metrics errors.""" - - -class GithubClientError(Exception): - """Base class for all github client errors.""" - - -class GithubApiError(GithubClientError): - """Represents an error when the GitHub API returns an error.""" - - -class TokenError(GithubClientError): - """Represents an error when the token is invalid or has not enough permissions.""" - - -class JobNotFoundError(GithubClientError): - """Represents an error when the job could not be found on GitHub.""" - - class RunnerLogsError(Exception): """Base class for all runner logs errors.""" - - -class OpenStackError(Exception): - """Base class for OpenStack errors.""" - - -class OpenStackInvalidConfigError(OpenStackError): - """Represents an invalid OpenStack configuration.""" - - -class OpenStackUnauthorizedError(OpenStackError): - """Represents an unauthorized connection to OpenStack.""" - - -class SSHError(Exception): - """Represents an error while interacting with SSH.""" - - -class KeyfileError(SSHError): - """Represents missing keyfile for SSH.""" diff --git a/src/event_timer.py 
b/src/event_timer.py index da03150ff..ab94c5736 100644 --- a/src/event_timer.py +++ b/src/event_timer.py @@ -5,7 +5,7 @@ import logging import subprocess # nosec B404 from pathlib import Path -from typing import Optional, TypedDict +from typing import TypedDict import jinja2 @@ -107,9 +107,7 @@ def is_active(self, event_name: str) -> bool: return ret_code == 0 - def ensure_event_timer( - self, event_name: str, interval: int, timeout: Optional[int] = None - ) -> None: + def ensure_event_timer(self, event_name: str, interval: int, timeout: int) -> None: """Ensure that a systemd service and timer are registered to dispatch the given event. The interval is how frequently, in minutes, the event should be dispatched. @@ -125,10 +123,7 @@ def ensure_event_timer( Raises: TimerEnableError: Timer cannot be started. Events will be not emitted. """ - if timeout is not None: - timeout_in_secs = timeout * 60 - else: - timeout_in_secs = interval * 30 + timeout_in_secs = timeout * 60 context: EventConfig = { "event": event_name, diff --git a/src/github_client.py b/src/github_client.py index 3c7718f94..b14d3b799 100644 --- a/src/github_client.py +++ b/src/github_client.py @@ -6,27 +6,22 @@ Migrate to PyGithub in the future. PyGithub is still lacking some API such as remove token for runner. 
""" -import functools import logging -from datetime import datetime -from typing import Callable, ParamSpec, TypeVar -from urllib.error import HTTPError - -from ghapi.all import GhApi, pages -from ghapi.page import paged -from typing_extensions import assert_never - -from charm_state import Arch, GithubOrg, GithubPath, GithubRepo -from errors import GithubApiError, JobNotFoundError, RunnerBinaryError, TokenError -from github_type import ( - JobStats, - RegistrationToken, - RemoveToken, +from typing import ParamSpec, TypeVar + +from github_runner_manager.github_client import GithubClient as GitHubRunnerManagerGitHubClient +from github_runner_manager.github_client import catch_http_errors +from github_runner_manager.types_.github import ( + GitHubOrg, + GitHubPath, + GitHubRepo, RunnerApplication, RunnerApplicationList, - SelfHostedRunner, ) +from charm_state import Arch +from errors import RunnerBinaryError + logger = logging.getLogger(__name__) # Parameters of the function decorated with retry @@ -35,60 +30,12 @@ ReturnT = TypeVar("ReturnT") -def catch_http_errors(func: Callable[ParamT, ReturnT]) -> Callable[ParamT, ReturnT]: - """Catch HTTP errors and raise custom exceptions. - - Args: - func: The target function to catch common errors for. - - Returns: - The decorated function. - """ - - @functools.wraps(func) - def wrapper(*args: ParamT.args, **kwargs: ParamT.kwargs) -> ReturnT: - """Catch common errors when using the GitHub API. - - Args: - args: Placeholder for positional arguments. - kwargs: Placeholder for keyword arguments. - - Raises: - TokenError: If there was an error with the provided token. - GithubApiError: If there was an unexpected error using the GitHub API. - - Returns: - The decorated function. - """ - try: - return func(*args, **kwargs) - except HTTPError as exc: - if exc.code in (401, 403): - if exc.code == 401: - msg = "Invalid token." - else: - msg = "Provided token has not enough permissions or has reached rate-limit." 
- raise TokenError(msg) from exc - raise GithubApiError from exc - - return wrapper - - -class GithubClient: +class GithubClient(GitHubRunnerManagerGitHubClient): """GitHub API client.""" - def __init__(self, token: str): - """Instantiate the GiHub API client. - - Args: - token: GitHub personal token for API requests. - """ - self._token = token - self._client = GhApi(token=self._token) - @catch_http_errors def get_runner_application( - self, path: GithubPath, arch: Arch, os: str = "linux" + self, path: GitHubPath, arch: Arch, os: str = "linux" ) -> RunnerApplication: """Get runner application available for download for given arch. @@ -106,11 +53,11 @@ def get_runner_application( The runner application. """ runner_applications: RunnerApplicationList = [] - if isinstance(path, GithubRepo): + if isinstance(path, GitHubRepo): runner_applications = self._client.actions.list_runner_applications_for_repo( owner=path.owner, repo=path.repo ) - if isinstance(path, GithubOrg): + if isinstance(path, GitHubOrg): runner_applications = self._client.actions.list_runner_applications_for_org( org=path.org ) @@ -125,176 +72,3 @@ def get_runner_application( raise RunnerBinaryError( f"Unable query GitHub runner binary information for {os} {arch}" ) from err - - @catch_http_errors - def get_runner_github_info(self, path: GithubPath) -> list[SelfHostedRunner]: - """Get runner information on GitHub under a repo or org. - - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - - Returns: - List of runner information. - """ - remote_runners_list: list[SelfHostedRunner] = [] - - if isinstance(path, GithubRepo): - # The documentation of ghapi for pagination is incorrect and examples will give errors. - # This workaround is a temp solution. Will be moving to PyGitHub in the future. 
- self._client.actions.list_self_hosted_runners_for_repo( - owner=path.owner, repo=path.repo, per_page=100 - ) - num_of_pages = self._client.last_page() - remote_runners_list = [ - item - for page in pages( - self._client.actions.list_self_hosted_runners_for_repo, - num_of_pages + 1, - owner=path.owner, - repo=path.repo, - per_page=100, - ) - for item in page["runners"] - ] - if isinstance(path, GithubOrg): - # The documentation of ghapi for pagination is incorrect and examples will give errors. - # This workaround is a temp solution. Will be moving to PyGitHub in the future. - self._client.actions.list_self_hosted_runners_for_org(org=path.org, per_page=100) - num_of_pages = self._client.last_page() - remote_runners_list = [ - item - for page in pages( - self._client.actions.list_self_hosted_runners_for_org, - num_of_pages + 1, - org=path.org, - per_page=100, - ) - for item in page["runners"] - ] - return remote_runners_list - - @catch_http_errors - def get_runner_remove_token(self, path: GithubPath) -> str: - """Get token from GitHub used for removing runners. - - Args: - path: The Github org/repo path. - - Returns: - The removing token. - """ - token: RemoveToken - if isinstance(path, GithubRepo): - token = self._client.actions.create_remove_token_for_repo( - owner=path.owner, repo=path.repo - ) - elif isinstance(path, GithubOrg): - token = self._client.actions.create_remove_token_for_org(org=path.org) - else: - assert_never(token) - - return token["token"] - - @catch_http_errors - def get_runner_registration_token(self, path: GithubPath) -> str: - """Get token from GitHub used for registering runners. - - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - - Returns: - The registration token. 
- """ - token: RegistrationToken - if isinstance(path, GithubRepo): - token = self._client.actions.create_registration_token_for_repo( - owner=path.owner, repo=path.repo - ) - elif isinstance(path, GithubOrg): - token = self._client.actions.create_registration_token_for_org(org=path.org) - else: - assert_never(token) - - return token["token"] - - @catch_http_errors - def delete_runner(self, path: GithubPath, runner_id: int) -> None: - """Delete the self-hosted runner from GitHub. - - Args: - path: GitHub repository path in the format '/', or the GitHub organization - name. - runner_id: Id of the runner. - """ - if isinstance(path, GithubRepo): - self._client.actions.delete_self_hosted_runner_from_repo( - owner=path.owner, - repo=path.repo, - runner_id=runner_id, - ) - if isinstance(path, GithubOrg): - self._client.actions.delete_self_hosted_runner_from_org( - org=path.org, - runner_id=runner_id, - ) - - def get_job_info(self, path: GithubRepo, workflow_run_id: str, runner_name: str) -> JobStats: - """Get information about a job for a specific workflow run. - - Args: - path: GitHub repository path in the format '/'. - workflow_run_id: Id of the workflow run. - runner_name: Name of the runner. - - Raises: - TokenError: if there was an error with the Github token crdential provided. - JobNotFoundError: If no jobs were found. - - Returns: - Job information. 
- """ - paged_kwargs = {"owner": path.owner, "repo": path.repo, "run_id": workflow_run_id} - try: - for wf_run_page in paged( - self._client.actions.list_jobs_for_workflow_run, **paged_kwargs - ): - jobs = wf_run_page["jobs"] - # ghapi performs endless pagination, - # so we have to break out of the loop if there are no more jobs - if not jobs: - break - for job in jobs: - if job["runner_name"] == runner_name: - # datetime strings should be in ISO 8601 format, - # but they can also use Z instead of - # +00:00, which is not supported by datetime.fromisoformat - created_at = datetime.fromisoformat( - job["created_at"].replace("Z", "+00:00") - ) - started_at = datetime.fromisoformat( - job["started_at"].replace("Z", "+00:00") - ) - # conclusion could be null per api schema, so we need to handle that - # though we would assume that it should always be present, - # as the job should be finished - conclusion = job.get("conclusion", None) - - job_id = job["id"] - return JobStats( - job_id=job_id, - created_at=created_at, - started_at=started_at, - conclusion=conclusion, - ) - - except HTTPError as exc: - if exc.code in (401, 403): - raise TokenError from exc - raise JobNotFoundError( - f"Could not find job for runner {runner_name}. " - f"Could not list jobs for workflow run {workflow_run_id}" - ) from exc - - raise JobNotFoundError(f"Could not find job for runner {runner_name}.") diff --git a/src/github_type.py b/src/github_type.py deleted file mode 100644 index a26a0279a..000000000 --- a/src/github_type.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Return type for the GitHub web API.""" - - -from __future__ import annotations - -from datetime import datetime -from enum import Enum -from typing import List, Literal, Optional, TypedDict - -from pydantic import BaseModel -from typing_extensions import NotRequired - - -class GitHubRunnerStatus(str, Enum): - """Status of runner on GitHub. 
- - Attributes: - ONLINE: Represents an online runner status. - OFFLINE: Represents an offline runner status. - """ - - ONLINE = "online" - OFFLINE = "offline" - - -# See response schema for -# https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#list-runner-applications-for-an-organization -class RunnerApplication(TypedDict, total=False): - """Information on the runner application. - - Attributes: - os: Operating system to run the runner application on. - architecture: Computer Architecture to run the runner application on. - download_url: URL to download the runner application. - filename: Filename of the runner application. - temp_download_token: A short lived bearer token used to download the - runner, if needed. - sha256_checksum: SHA256 Checksum of the runner application. - """ - - os: Literal["linux", "win", "osx"] - architecture: Literal["arm", "arm64", "x64"] - download_url: str - filename: str - temp_download_token: NotRequired[str] - sha256_checksum: NotRequired[str] - - -RunnerApplicationList = List[RunnerApplication] - - -class SelfHostedRunnerLabel(TypedDict, total=False): - """A single label of self-hosted runners. - - Attributes: - id: Unique identifier of the label. - name: Name of the label. - type: Type of label. Read-only labels are applied automatically when - the runner is configured. - """ - - id: NotRequired[int] - name: str - type: NotRequired[str] - - -class SelfHostedRunner(TypedDict): - """Information on a single self-hosted runner. - - Attributes: - busy: Whether the runner is executing a job. - id: Unique identifier of the runner. - labels: Labels of the runner. - os: Operation system of the runner. - name: Name of the runner. - status: The Github runner status. - """ - - busy: bool - id: int - labels: list[SelfHostedRunnerLabel] - os: str - name: str - status: GitHubRunnerStatus - - -class SelfHostedRunnerList(TypedDict): - """Information on a collection of self-hosted runners. 
- - Attributes: - total_count: Total number of runners. - runners: List of runners. - """ - - total_count: int - runners: list[SelfHostedRunner] - - -class RegistrationToken(TypedDict): - """Token used for registering GitHub runners. - - Attributes: - token: Token for registering GitHub runners. - expires_at: Time the token expires at. - """ - - token: str - expires_at: str - - -class RemoveToken(TypedDict): - """Token used for removing GitHub runners. - - Attributes: - token: Token for removing GitHub runners. - expires_at: Time the token expires at. - """ - - token: str - expires_at: str - - -class JobConclusion(str, Enum): - """Conclusion of a job on GitHub. - - See :https://docs.github.com/en/rest/actions/workflow-runs?apiVersion=2022-11-28\ -#list-workflow-runs-for-a-repository - - Attributes: - ACTION_REQUIRED: Represents additional action required on the job. - CANCELLED: Represents a cancelled job status. - FAILURE: Represents a failed job status. - NEUTRAL: Represents a job status that can optionally succeed or fail. - SKIPPED: Represents a skipped job status. - SUCCESS: Represents a successful job status. - TIMED_OUT: Represents a job that has timed out. - """ - - ACTION_REQUIRED = "action_required" - CANCELLED = "cancelled" - FAILURE = "failure" - NEUTRAL = "neutral" - SKIPPED = "skipped" - SUCCESS = "success" - TIMED_OUT = "timed_out" - - -class JobStats(BaseModel): - """Stats for a job on GitHub. - - Attributes: - job_id: The ID of the job. - created_at: The time the job was created. - started_at: The time the job was started. - conclusion: The end result of a job. 
- """ - - job_id: int - created_at: datetime - started_at: datetime - conclusion: Optional[JobConclusion] diff --git a/src/grafana_dashboards/metrics.json b/src/grafana_dashboards/metrics.json index 8582b77ef..0e8728765 100644 --- a/src/grafana_dashboards/metrics.json +++ b/src/grafana_dashboards/metrics.json @@ -24,10 +24,23 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 537, + "id": 567, "links": [], "liveNow": false, "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "title": "General", + "type": "row" + }, { "datasource": { "type": "loki", @@ -91,7 +104,7 @@ "h": 8, "w": 12, "x": 0, - "y": 0 + "y": 1 }, "id": 3, "options": { @@ -212,7 +225,7 @@ "h": 8, "w": 12, "x": 12, - "y": 0 + "y": 1 }, "id": 22, "options": { @@ -253,19 +266,6 @@ "title": "Available Runners", "type": "barchart" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 8 - }, - "id": 10, - "panels": [], - "title": "General", - "type": "row" - }, { "datasource": { "type": "loki", @@ -485,7 +485,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\", duration=\"queue_duration\", flavor=\"flavor\" | __error__=`` | event = `runner_start` | flavor =~ `$flavor` | unwrap duration [1h]) by (flavor)", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"queue_duration\",flavor=\"flavor\" | __error__=\"\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(flavor)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", 
"legendFormat": "{{flavor}}", @@ -493,7 +493,7 @@ "refId": "E" } ], - "title": "Max Job Queue Duration By Flavour", + "title": "Max Job Queue Duration By Application", "type": "timeseries" }, { @@ -789,10 +789,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, + "x": 0, "y": 25 }, - "id": 4, + "id": 24, "options": { "legend": { "calcs": [], @@ -812,7 +812,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "50%", "queryType": "range", @@ -824,12 +824,12 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap 
duration[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "95%", "queryType": "range", - "refId": "C" + "refId": "D" }, { "datasource": { @@ -837,30 +837,151 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "99%", "queryType": "range", - "refId": "D" + "refId": "E" }, { "datasource": { "type": "loki", "uid": "${lokids}" }, - "editorMode": "code", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", + "editorMode": "builder", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", "hide": false, "key": 
"Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "Max", "queryType": "range", - "refId": "E" + "refId": "C" } ], - "title": "Runner Idle Duration (Percentile)", + "title": "Reconciliation Duration (Percentile)", "type": "timeseries" }, + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "description": "The average actual reconciliation interval.\n\nUnlike the Reconciliation Duration panel, this panel shows the time between the actual start of reconciliations. This is useful for identifying problems where the reconciliation itself is behaving normally, but the event itself is not being scheduled as expected.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "Duration" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 6, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": false + }, + "orientation": "horizontal", + "showValue": "never", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "${lokids}" + }, + "editorMode": 
"code", + "expr": "1 / sum by (flavor) (rate({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\", flavor=\"flavor\" | event=\"reconciliation\" [$__range]))", + "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", + "legendFormat": "", + "queryType": "instant", + "refId": "A" + } + ], + "title": "Average Reconciliation Interval", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": true, + "field": "Value #A" + } + ] + } + } + ], + "type": "barchart" + }, { "datasource": { "type": "loki", @@ -926,7 +1047,7 @@ "x": 12, "y": 33 }, - "id": 6, + "id": 4, "options": { "legend": { "calcs": [], @@ -946,7 +1067,7 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.5,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "50%", "queryType": "range", @@ -958,12 +1079,12 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json 
event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.95,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "95%", "queryType": "range", - "refId": "D" + "refId": "C" }, { "datasource": { @@ -971,28 +1092,28 @@ "uid": "${lokids}" }, "editorMode": "builder", - "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "expr": "quantile_over_time(0.99,{filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "99%", "queryType": "range", - "refId": "E" + "refId": "D" }, { "datasource": { "type": "loki", "uid": "${lokids}" }, - "editorMode": "builder", - "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",duration=\"duration\",flavor=\"flavor\" | event=\"reconciliation\" | 
flavor=~\"$flavor\" | unwrap duration[1h]) by(filename)", + "editorMode": "code", + "expr": "max_over_time({filename=\"/var/log/github-runner-metrics.log\", juju_application=~\"$juju_application\", juju_model=~\"$juju_model\", juju_model_uuid=~\"$juju_model_uuid\", juju_unit=~\"$juju_unit\"} | json event=\"event\",idle=\"idle\",flavor=\"flavor\" | event=\"runner_start\" | flavor=~\"$flavor\" | unwrap idle[1h]) by(filename)", "hide": false, "key": "Q-9302bc4d-cce0-4674-bad5-353257fdd2f4-0", "legendFormat": "Max", "queryType": "range", - "refId": "C" + "refId": "E" } ], - "title": "Reconciliation Duration (Percentile)", + "title": "Runner Idle Duration (Percentile)", "type": "timeseries" }, { @@ -1924,6 +2045,6 @@ "timepicker": {}, "timezone": "", "title": "GitHub Self-Hosted Runner Metrics", - "version": 14, + "version": 15, "weekStart": "" } diff --git a/src/logrotate.py b/src/logrotate.py index 0fd65d5af..294c651dd 100644 --- a/src/logrotate.py +++ b/src/logrotate.py @@ -6,11 +6,11 @@ from pathlib import Path from charms.operator_libs_linux.v1 import systemd +from github_runner_manager.metrics.events import METRICS_LOG_PATH +from github_runner_manager.reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from pydantic import BaseModel from errors import LogrotateSetupError -from metrics.events import METRICS_LOG_PATH -from reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR LOG_ROTATE_TIMER_SYSTEMD_SERVICE = "logrotate.timer" diff --git a/src/manager/cloud_runner_manager.py b/src/manager/cloud_runner_manager.py deleted file mode 100644 index 28ed17b20..000000000 --- a/src/manager/cloud_runner_manager.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Interface of manager of runner instance on clouds.""" - -import abc -import logging -from dataclasses import dataclass -from enum import Enum, auto -from typing import Iterator, Sequence, Tuple - -from charm_state import GithubPath, ProxyConfig, SSHDebugConnection -from metrics.runner import RunnerMetrics - -logger = logging.getLogger(__name__) - -InstanceId = str - - -class HealthState(Enum): - """Health state of the runners. - - Attributes: - HEALTHY: The runner is healthy. - UNHEALTHY: The runner is not healthy. - UNKNOWN: Unable to get the health state. - """ - - HEALTHY = auto() - UNHEALTHY = auto() - UNKNOWN = auto() - - -class CloudRunnerState(str, Enum): - """Represent state of the instance hosting the runner. - - Attributes: - CREATED: The instance is created. - ACTIVE: The instance is active and running. - DELETED: The instance is deleted. - ERROR: The instance has encountered error and not running. - STOPPED: The instance has stopped. - UNKNOWN: The state of the instance is not known. - UNEXPECTED: An unknown state not accounted by the developer is encountered. - """ - - CREATED = auto() - ACTIVE = auto() - DELETED = auto() - ERROR = auto() - STOPPED = auto() - UNKNOWN = auto() - UNEXPECTED = auto() - - @staticmethod - def from_openstack_server_status( - openstack_server_status: str, - ) -> "CloudRunnerState": - """Create from openstack server status. - - The openstack server status are documented here: - https://docs.openstack.org/api-guide/compute/server_concepts.html - - Args: - openstack_server_status: Openstack server status. - - Returns: - The state of the runner. 
- """ - state = CloudRunnerState.UNEXPECTED - match openstack_server_status: - case "BUILD": - state = CloudRunnerState.CREATED - case "REBUILD": - state = CloudRunnerState.CREATED - case "ACTIVE": - state = CloudRunnerState.ACTIVE - case "ERROR": - state = CloudRunnerState.ERROR - case "STOPPED": - state = CloudRunnerState.STOPPED - case "DELETED": - state = CloudRunnerState.DELETED - case "UNKNOWN": - state = CloudRunnerState.UNKNOWN - case _: - state = CloudRunnerState.UNEXPECTED - return state - - -@dataclass -class GitHubRunnerConfig: - """Configuration for GitHub runner spawned. - - Attributes: - github_path: The GitHub organization or repository for runners to connect to. - labels: The labels to add to runners. - """ - - github_path: GithubPath - labels: list[str] - - -@dataclass -class SupportServiceConfig: - """Configuration for supporting services for runners. - - Attributes: - proxy_config: The proxy configuration. - dockerhub_mirror: The dockerhub mirror to use for runners. - ssh_debug_connections: The information on the ssh debug services. - repo_policy_url: The URL of the repo policy service. - repo_policy_token: The token to access the repo policy service. - """ - - proxy_config: ProxyConfig | None - dockerhub_mirror: str | None - ssh_debug_connections: list[SSHDebugConnection] | None - repo_policy_url: str | None - repo_policy_token: str | None - - -@dataclass -class CloudRunnerInstance: - """Information on the runner on the cloud. - - Attributes: - name: Name of the instance hosting the runner. - instance_id: ID of the instance. - health: Health state of the runner. - state: State of the instance hosting the runner. - """ - - name: str - instance_id: InstanceId - health: HealthState - state: CloudRunnerState - - -class CloudRunnerManager(abc.ABC): - """Manage runner instance on cloud. - - Attributes: - name_prefix: The name prefix of the self-hosted runners. 
- """ - - @property - @abc.abstractmethod - def name_prefix(self) -> str: - """Get the name prefix of the self-hosted runners.""" - - @abc.abstractmethod - def create_runner(self, registration_token: str) -> InstanceId: - """Create a self-hosted runner. - - Args: - registration_token: The GitHub registration token for registering runners. - """ - - @abc.abstractmethod - def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance: - """Get a self-hosted runner by instance id. - - Args: - instance_id: The instance id. - """ - - @abc.abstractmethod - def get_runners(self, states: Sequence[CloudRunnerState]) -> Tuple[CloudRunnerInstance]: - """Get self-hosted runners by state. - - Args: - states: Filter for the runners with these github states. If None all states will be - included. - """ - - @abc.abstractmethod - def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: - """Delete self-hosted runner. - - Args: - instance_id: The instance id of the runner to delete. - remove_token: The GitHub remove token. - """ - - @abc.abstractmethod - def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: - """Stop all runners. - - Args: - remove_token: The GitHub remove token for removing runners. - busy: If false, only idle runners are removed. If true, both idle and busy runners are - removed. - """ - - @abc.abstractmethod - def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: - """Cleanup runner and resource on the cloud. - - Perform health check on runner and delete the runner if it fails. - - Args: - remove_token: The GitHub remove token for removing runners. - """ diff --git a/src/manager/github_runner_manager.py b/src/manager/github_runner_manager.py deleted file mode 100644 index 0aed972bd..000000000 --- a/src/manager/github_runner_manager.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Client for managing self-hosted runner on GitHub side.""" - -from enum import Enum, auto -from typing import Sequence - -from charm_state import GithubPath -from github_client import GithubClient -from github_type import GitHubRunnerStatus, SelfHostedRunner - - -class GitHubRunnerState(str, Enum): - """State of the self-hosted runner on GitHub. - - Attributes: - BUSY: Runner is working on a job assigned by GitHub. - IDLE: Runner is waiting to take a job or is running pre-job tasks (i.e. - repo-policy-compliance check). - OFFLINE: Runner is not connected to GitHub. - """ - - BUSY = auto() - IDLE = auto() - OFFLINE = auto() - - @staticmethod - def from_runner(runner: SelfHostedRunner) -> "GitHubRunnerState": - """Construct the object from GtiHub runner information. - - Args: - runner: Information on the GitHub self-hosted runner. - - Returns: - The state of runner. - """ - state = GitHubRunnerState.OFFLINE - # A runner that is busy and offline is possible. - if runner.busy: - state = GitHubRunnerState.BUSY - if runner.status == GitHubRunnerStatus.ONLINE: - if not runner.busy: - state = GitHubRunnerState.IDLE - return state - - -class GithubRunnerManager: - """Manage self-hosted runner on GitHub side.""" - - def __init__(self, prefix: str, token: str, path: GithubPath): - """Construct the object. - - Args: - prefix: The prefix in the name to identify the runners managed by this instance. - token: The GitHub personal access token to access the GitHub API. - path: The GitHub repository or organization to register the runners under. - """ - self._prefix = prefix - self._path = path - self.github = GithubClient(token) - - def get_runners( - self, states: Sequence[GitHubRunnerState] | None = None - ) -> tuple[SelfHostedRunner]: - """Get info on self-hosted runners of certain states. - - Args: - states: Filter the runners for these states. If None, all runners are returned. - - Returns: - Information on the runners. 
- """ - runner_list = self.github.get_runner_github_info(self._path) - return tuple( - runner - for runner in runner_list - if runner.name.startswith(self._prefix) - and GithubRunnerManager._is_runner_in_state(runner, states) - ) - - def delete_runners(self, states: Sequence[GitHubRunnerState] | None = None) -> None: - """Delete the self-hosted runners of certain states. - - Args: - states: Filter the runners for these states. If None, all runners are deleted. - """ - runner_list = self.get_runners(states) - for runner in runner_list: - self.github.delete_runner(self._path, runner.id) - - def get_registration_token(self) -> str: - """Get registration token from GitHub. - - This token is used for registering self-hosted runners. - - Returns: - The registration token. - """ - return self.github.get_runner_registration_token(self._path) - - def get_removal_token(self) -> str: - """Get removal token from GitHub. - - This token is used for removing self-hosted runners. - - Returns: - The removal token. - """ - return self.github.get_runner_remove_token(self._path) - - @staticmethod - def _is_runner_in_state( - runner: SelfHostedRunner, states: Sequence[GitHubRunnerState] | None - ) -> bool: - """Check that the runner is in one of the states provided. - - Args: - runner: Runner to filter. - states: States in which to check the runner belongs to. - - Returns: - True if the runner is in one of the state, else false. - """ - if states is None: - return True - return GitHubRunnerState.from_runner(runner) in states diff --git a/src/manager/runner_manager.py b/src/manager/runner_manager.py deleted file mode 100644 index 048b9c628..000000000 --- a/src/manager/runner_manager.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Class for managing the GitHub self-hosted runners hosted on cloud instances.""" - -import logging -from dataclasses import dataclass -from enum import Enum, auto -from multiprocessing import Pool -from typing import Iterator, Sequence, Type, cast - -from charm_state import GithubPath -from errors import GithubMetricsError, RunnerCreateError -from github_type import SelfHostedRunner -from manager.cloud_runner_manager import ( - CloudRunnerInstance, - CloudRunnerManager, - CloudRunnerState, - HealthState, - InstanceId, -) -from manager.github_runner_manager import GithubRunnerManager, GitHubRunnerState -from metrics import events as metric_events -from metrics import github as github_metrics -from metrics import runner as runner_metrics -from metrics.runner import RunnerMetrics - -logger = logging.getLogger(__name__) - -IssuedMetricEventsStats = dict[Type[metric_events.Event], int] - - -class FlushMode(Enum): - """Strategy for flushing runners. - - Attributes: - FLUSH_IDLE: Flush idle runners. - FLUSH_BUSY: Flush busy runners. - """ - - FLUSH_IDLE = auto() - FLUSH_BUSY = auto() - - -@dataclass -class RunnerInstance: - """Represents an instance of runner. - - Attributes: - name: Full name of the runner. Managed by the cloud runner manager. - instance_id: ID of the runner. Managed by the runner manager. - health: The health state of the runner. - github_state: State on github. - cloud_state: State on cloud. - """ - - name: str - instance_id: InstanceId - health: HealthState - github_state: GitHubRunnerState | None - cloud_state: CloudRunnerState - - def __init__(self, cloud_instance: CloudRunnerInstance, github_info: SelfHostedRunner | None): - """Construct an instance. - - Args: - cloud_instance: Information on the cloud instance. - github_info: Information on the GitHub of the runner. 
- """ - self.name = cloud_instance.name - self.instance_id = cloud_instance.instance_id - self.health = cloud_instance.health - self.github_state = ( - GitHubRunnerState.from_runner(github_info) if github_info is not None else None - ) - self.cloud_state = cloud_instance.state - - -@dataclass -class RunnerManagerConfig: - """Configuration for the runner manager. - - Attributes: - token: GitHub personal access token to query GitHub API. - path: Path to GitHub repository or organization to registry the runners. - """ - - token: str - path: GithubPath - - -class RunnerManager: - """Manage the runners. - - Attributes: - name_prefix: The name prefix of the runners. - """ - - def __init__(self, cloud_runner_manager: CloudRunnerManager, config: RunnerManagerConfig): - """Construct the object. - - Args: - cloud_runner_manager: For managing the cloud instance of the runner. - config: Configuration of this class. - """ - self._config = config - self._cloud = cloud_runner_manager - self.name_prefix = self._cloud.name_prefix - self._github = GithubRunnerManager( - prefix=self.name_prefix, token=self._config.token, path=self._config.path - ) - - def create_runners(self, num: int) -> tuple[InstanceId]: - """Create runners. - - Args: - num: Number of runners to create. - - Returns: - List of instance ID of the runners. 
- """ - logger.info("Creating %s runners", num) - registration_token = self._github.get_registration_token() - - create_runner_args = [ - RunnerManager._CreateRunnerArgs(self._cloud, registration_token) for _ in range(num) - ] - instance_id_list = [] - with Pool(processes=min(num, 10)) as pool: - jobs = pool.imap_unordered( - func=RunnerManager._create_runner, iterable=create_runner_args - ) - for _ in range(num): - try: - instance_id = next(jobs) - except RunnerCreateError: - logger.exception("Failed to spawn a runner.") - except StopIteration: - break - else: - instance_id_list.append(instance_id) - return tuple(instance_id_list) - - def get_runners( - self, - github_states: Sequence[GitHubRunnerState] | None = None, - cloud_states: Sequence[CloudRunnerState] | None = None, - ) -> tuple[RunnerInstance]: - """Get information on runner filter by state. - - Only runners that has cloud instance are returned. - - Args: - github_states: Filter for the runners with these github states. If None all - states will be included. - cloud_states: Filter for the runners with these cloud states. If None all states - will be included. - - Returns: - Information on the runners. 
- """ - logger.info("Getting runners...") - github_infos = self._github.get_runners(github_states) - cloud_infos = self._cloud.get_runners(cloud_states) - github_infos_map = {info.name: info for info in github_infos} - cloud_infos_map = {info.name: info for info in cloud_infos} - logger.info( - "Found following runners: %s", cloud_infos_map.keys() | github_infos_map.keys() - ) - - runner_names = cloud_infos_map.keys() & github_infos_map.keys() - cloud_only = cloud_infos_map.keys() - runner_names - github_only = github_infos_map.keys() - runner_names - if cloud_only: - logger.warning( - "Found runner instance on cloud but not registered on GitHub: %s", cloud_only - ) - if github_only: - logger.warning( - "Found self-hosted runner on GitHub but no matching runner instance on cloud: %s", - github_only, - ) - - runner_instances: list[RunnerInstance] = [ - RunnerInstance( - cloud_infos_map[name], github_infos_map[name] if name in github_infos_map else None - ) - for name in cloud_infos_map.keys() - ] - if cloud_states is not None: - runner_instances = [ - runner for runner in runner_instances if runner.cloud_state in cloud_states - ] - if github_states is not None: - runner_instances = [ - runner - for runner in runner_instances - if runner.github_state is not None and runner.github_state in github_states - ] - return cast(tuple[RunnerInstance], tuple(runner_instances)) - - def delete_runners(self, num: int) -> IssuedMetricEventsStats: - """Delete runners. - - Args: - num: The number of runner to delete. - - Returns: - Stats on metrics events issued during the deletion of runners. 
- """ - logger.info("Deleting %s number of runners", num) - runners_list = self.get_runners()[:num] - runner_names = [runner.name for runner in runners_list] - logger.info("Deleting runners: %s", runner_names) - remove_token = self._github.get_removal_token() - return self._delete_runners(runners=runners_list, remove_token=remove_token) - - def flush_runners( - self, flush_mode: FlushMode = FlushMode.FLUSH_IDLE - ) -> IssuedMetricEventsStats: - """Delete runners according to state. - - Args: - flush_mode: The type of runners affect by the deletion. - - Returns: - Stats on metrics events issued during the deletion of runners. - """ - match flush_mode: - case FlushMode.FLUSH_IDLE: - logger.info("Flushing idle runners...") - case FlushMode.FLUSH_BUSY: - logger.info("Flushing idle and busy runners...") - case _: - logger.critical( - "Unknown flush mode %s encountered, contact developers", flush_mode - ) - - busy = False - if flush_mode == FlushMode.FLUSH_BUSY: - busy = True - remove_token = self._github.get_removal_token() - stats = self._cloud.flush_runners(remove_token, busy) - return self._issue_runner_metrics(metrics=stats) - - def cleanup(self) -> IssuedMetricEventsStats: - """Run cleanup of the runners and other resources. - - Returns: - Stats on metrics events issued during the cleanup of runners. - """ - self._github.delete_runners([GitHubRunnerState.OFFLINE]) - remove_token = self._github.get_removal_token() - deleted_runner_metrics = self._cloud.cleanup(remove_token) - return self._issue_runner_metrics(metrics=deleted_runner_metrics) - - def _delete_runners( - self, runners: Sequence[RunnerInstance], remove_token: str - ) -> IssuedMetricEventsStats: - """Delete list of runners. - - Args: - runners: The runners to delete. - remove_token: The token for removing self-hosted runners. - - Returns: - Stats on metrics events issued during the deletion of runners. 
- """ - runner_metrics_list = [] - for runner in runners: - deleted_runner_metrics = self._cloud.delete_runner( - instance_id=runner.instance_id, remove_token=remove_token - ) - if deleted_runner_metrics is not None: - runner_metrics_list.append(deleted_runner_metrics) - return self._issue_runner_metrics(metrics=iter(runner_metrics_list)) - - def _issue_runner_metrics(self, metrics: Iterator[RunnerMetrics]) -> IssuedMetricEventsStats: - """Issue runner metrics. - - Args: - metrics: Runner metrics to issue. - - Returns: - Stats on runner metrics issued. - """ - total_stats: IssuedMetricEventsStats = {} - - for extracted_metrics in metrics: - try: - job_metrics = github_metrics.job( - github_client=self._github.github, - pre_job_metrics=extracted_metrics.pre_job, - runner_name=extracted_metrics.runner_name, - ) - except GithubMetricsError: - logger.exception( - "Failed to calculate job metrics for %s", extracted_metrics.runner_name - ) - job_metrics = None - - issued_events = runner_metrics.issue_events( - runner_metrics=extracted_metrics, - job_metrics=job_metrics, - flavor=self.name_prefix, - ) - - for event_type in issued_events: - total_stats[event_type] = total_stats.get(event_type, 0) + 1 - - return total_stats - - @dataclass - class _CreateRunnerArgs: - """Arguments for the _create_runner function. - - Attrs: - cloud_runner_manager: For managing the cloud instance of the runner. - registration_token: The GitHub provided-token for registering runners. - """ - - cloud_runner_manager: CloudRunnerManager - registration_token: str - - @staticmethod - def _create_runner(args: _CreateRunnerArgs) -> InstanceId: - """Create a single runner. - - This is a staticmethod for usage with multiprocess.Pool. - - Args: - args: The arguments. - - Returns: - The instance ID of the runner created. 
- """ - return args.cloud_runner_manager.create_runner(registration_token=args.registration_token) diff --git a/src/metrics/__init__.py b/src/metrics/__init__.py deleted file mode 100644 index d2a48eaed..000000000 --- a/src/metrics/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Package for common metrics-related code.""" diff --git a/src/metrics/events.py b/src/metrics/events.py deleted file mode 100644 index 6f858166d..000000000 --- a/src/metrics/events.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Models and functions for the metric events.""" -import logging -from pathlib import Path -from typing import Any, Optional - -from pydantic import BaseModel, NonNegativeFloat - -from errors import IssueMetricEventError - -METRICS_LOG_PATH = Path("/var/log/github-runner-metrics.log") - - -logger = logging.getLogger(__name__) - - -class Event(BaseModel): - """Base class for metric events. - - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - event: The name of the event. Will be set to the class name in snake case if not provided. - """ - - timestamp: NonNegativeFloat - event: str - - @staticmethod - def _camel_to_snake(camel_case_string: str) -> str: - """Convert a camel case string to snake case. - - Args: - camel_case_string: The string to convert. - - Returns: - The converted string. - """ - snake_case_string = camel_case_string[0].lower() - for char in camel_case_string[1:]: - if char.isupper(): - snake_case_string += "_" + char.lower() - else: - snake_case_string += char - return snake_case_string - - def __init__(self, *args: Any, **kwargs: Any): - """Initialize the event. - - Args: - args: The positional arguments to pass to the base class. - kwargs: The keyword arguments to pass to the base class. These are used to set the - specific fields. E.g. 
timestamp=12345 will set the timestamp field to 12345. - """ - if "event" not in kwargs: - event = self._camel_to_snake(self.__class__.__name__) - kwargs["event"] = event - super().__init__(*args, **kwargs) - - -class RunnerInstalled(Event): - """Metric event for when a runner is installed. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - duration: The duration of the installation in seconds. - """ - - flavor: str - duration: NonNegativeFloat - - -class RunnerStart(Event): - """Metric event for when a runner is started. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - workflow: The workflow name. - repo: The repository name. - github_event: The github event. - idle: The idle time in seconds. - queue_duration: The time in seconds it took before the runner picked up the job. - This is optional as we rely on the Github API and there may be problems - retrieving the data. - """ - - flavor: str - workflow: str - repo: str - github_event: str - idle: NonNegativeFloat - queue_duration: Optional[NonNegativeFloat] - - -class CodeInformation(BaseModel): - """Information about a status code. - - This could e.g. be an exit code or a http status code. - - Attributes: - code: The status code. - """ - - code: int - - -class RunnerStop(Event): - """Metric event for when a runner is stopped. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - workflow: The workflow name. - repo: The repository name. - github_event: The github event. - status: A string describing the reason for stopping the runner. - status_info: More information about the status. - job_duration: The duration of the job in seconds. - job_conclusion: The job conclusion, e.g. "success", "failure", ... 
- """ - - flavor: str - workflow: str - repo: str - github_event: str - status: str - status_info: Optional[CodeInformation] - job_duration: NonNegativeFloat - job_conclusion: Optional[str] - - -class Reconciliation(Event): - """Metric event for when the charm has finished reconciliation. - - Attributes: - flavor: Describes the characteristics of the runner. - The flavor could be for example "small". - crashed_runners: The number of crashed runners. - idle_runners: The number of idle runners. - duration: The duration of the reconciliation in seconds. - """ - - flavor: str - crashed_runners: int - idle_runners: int - duration: NonNegativeFloat - - -def issue_event(event: Event) -> None: - """Issue a metric event. - - The metric event is logged to the metrics log. - - Args: - event: The metric event to log. - - Raises: - IssueMetricEventError: If the event cannot be logged. - """ - try: - with METRICS_LOG_PATH.open(mode="a", encoding="utf-8") as metrics_file: - metrics_file.write(f"{event.json(exclude_none=True)}\n") - except OSError as exc: - raise IssueMetricEventError(f"Cannot write to {METRICS_LOG_PATH}") from exc diff --git a/src/metrics/github.py b/src/metrics/github.py deleted file mode 100644 index 354933fea..000000000 --- a/src/metrics/github.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Functions to calculate metrics from data retrieved from GitHub.""" -import logging - -from charm_state import GithubRepo -from errors import GithubMetricsError, JobNotFoundError -from github_client import GithubClient -from metrics.runner import PreJobMetrics -from metrics.type import GithubJobMetrics - -logger = logging.getLogger(__name__) - - -def job( - github_client: GithubClient, pre_job_metrics: PreJobMetrics, runner_name: str -) -> GithubJobMetrics: - """Calculate the job metrics for a runner. - - The Github API is accessed to retrieve the job data for the runner. 
- - Args: - github_client: The GitHub API client. - pre_job_metrics: The pre-job metrics. - runner_name: The name of the runner. - - Raises: - GithubMetricsError: If the job for given workflow run is not found. - - Returns: - The job metrics. - """ - owner, repo = pre_job_metrics.repository.split("/", maxsplit=1) - - try: - job_info = github_client.get_job_info( - path=GithubRepo(owner=owner, repo=repo), - workflow_run_id=pre_job_metrics.workflow_run_id, - runner_name=runner_name, - ) - except JobNotFoundError as exc: - raise GithubMetricsError from exc - logger.debug( - "Job info for runner %s with workflow run id %s: %s", - runner_name, - pre_job_metrics.workflow_run_id, - job_info, - ) - - queue_duration = (job_info.started_at - job_info.created_at).total_seconds() - - return GithubJobMetrics(queue_duration=queue_duration, conclusion=job_info.conclusion) diff --git a/src/metrics/runner.py b/src/metrics/runner.py deleted file mode 100644 index b0ccc191a..000000000 --- a/src/metrics/runner.py +++ /dev/null @@ -1,470 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Classes and function to extract the metrics from storage and issue runner metrics events.""" - -import json -import logging -from enum import Enum -from json import JSONDecodeError -from pathlib import Path -from typing import Iterator, Optional, Type - -from pydantic import BaseModel, Field, NonNegativeFloat, ValidationError - -from errors import ( - CorruptMetricDataError, - DeleteMetricsStorageError, - IssueMetricEventError, - RunnerMetricsError, -) -from metrics import events as metric_events -from metrics.storage import MetricsStorage -from metrics.storage import StorageManager as MetricsStorageManager -from metrics.storage import move_to_quarantine -from metrics.type import GithubJobMetrics - -logger = logging.getLogger(__name__) - -FILE_SIZE_BYTES_LIMIT = 1024 -PRE_JOB_METRICS_FILE_NAME = "pre-job-metrics.json" -POST_JOB_METRICS_FILE_NAME = "post-job-metrics.json" -RUNNER_INSTALLED_TS_FILE_NAME = "runner-installed.timestamp" - - -class PreJobMetrics(BaseModel): - """Metrics for the pre-job phase of a runner. - - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - workflow: The workflow name. - workflow_run_id: The workflow run id. - repository: The repository path in the format '/'. - event: The github event. - """ - - timestamp: NonNegativeFloat - workflow: str - workflow_run_id: str - repository: str = Field(None, regex=r"^.+/.+$") - event: str - - -class PostJobStatus(str, Enum): - """The status of the post-job phase of a runner. - - Attributes: - NORMAL: Represents a normal post-job. - ABNORMAL: Represents an error with post-job. - REPO_POLICY_CHECK_FAILURE: Represents an error with repo-policy-compliance check. - """ - - NORMAL = "normal" - ABNORMAL = "abnormal" - REPO_POLICY_CHECK_FAILURE = "repo-policy-check-failure" - - -class CodeInformation(BaseModel): - """Information about a status code. - - Attributes: - code: The status code. 
- """ - - code: int - - -class PostJobMetrics(BaseModel): - """Metrics for the post-job phase of a runner. - - Attributes: - timestamp: The UNIX time stamp of the time at which the event was originally issued. - status: The status of the job. - status_info: More information about the status. - """ - - timestamp: NonNegativeFloat - status: PostJobStatus - status_info: Optional[CodeInformation] - - -class RunnerMetrics(BaseModel): - """Metrics for a runner. - - Attributes: - installed_timestamp: The UNIX time stamp of the time at which the runner was installed. - pre_job: The metrics for the pre-job phase. - post_job: The metrics for the post-job phase. - runner_name: The name of the runner. - """ - - installed_timestamp: NonNegativeFloat - pre_job: PreJobMetrics - post_job: Optional[PostJobMetrics] - runner_name: str - - -def extract( - metrics_storage_manager: MetricsStorageManager, runners: set[str], include: bool = False -) -> Iterator[RunnerMetrics]: - """Extract metrics from runners. - - The metrics are extracted from the metrics storage of the runners. - Orphan storages are cleaned up. - - If corrupt data is found, the metrics are not processed further and the storage is moved - to a special quarantine directory, as this may indicate that a malicious - runner is trying to manipulate the files on the storage. - - In order to avoid DoS attacks, the file size is also checked. - - Args: - metrics_storage_manager: The metrics storage manager. - runners: The runners to include or exclude. - include: If true the provided runners are included for metric extraction, else the provided - runners are excluded. - - Yields: - Extracted runner metrics of a particular runner. 
- """ - for ms in metrics_storage_manager.list_all(): - if (include and ms.runner_name in runners) or ( - not include and ms.runner_name not in runners - ): - runner_metrics = _extract_storage( - metrics_storage_manager=metrics_storage_manager, metrics_storage=ms - ) - if not runner_metrics: - logger.warning("Not able to issue metrics for runner %s", ms.runner_name) - else: - yield runner_metrics - - -def issue_events( - runner_metrics: RunnerMetrics, - flavor: str, - job_metrics: Optional[GithubJobMetrics], -) -> set[Type[metric_events.Event]]: - """Issue the metrics events for a runner. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Returns: - A set of issued events. - """ - runner_start_event = _create_runner_start(runner_metrics, flavor, job_metrics) - - issued_events = set() - try: - metric_events.issue_event(runner_start_event) - except ValidationError: - logger.exception( - "Not able to issue RunnerStart metric for " - "runner %s with pre-job metrics %s and job_metrics %s." - "Will not issue RunnerStop metric.", - runner_metrics.runner_name, - runner_metrics.pre_job, - job_metrics, - ) - except IssueMetricEventError: - logger.exception( - "Not able to issue RunnerStart metric for runner %s. " - "Will not issue RunnerStop metric.", - runner_metrics.runner_name, - ) - else: - issued_events = {metric_events.RunnerStart} - - # Return to not issuing RunnerStop metrics if RunnerStart metric could not be issued. 
- if not issued_events: - return issued_events - - if runner_metrics.post_job: - runner_stop_event = _create_runner_stop(runner_metrics, flavor, job_metrics) - - try: - metric_events.issue_event(runner_stop_event) - except ValidationError: - logger.exception( - "Not able to issue RunnerStop metric for " - "runner %s with pre-job metrics %s, post-job metrics %s and job_metrics %s.", - runner_metrics.runner_name, - runner_metrics.pre_job, - runner_metrics.post_job, - job_metrics, - ) - except IssueMetricEventError: - logger.exception( - "Not able to issue RunnerStop metric for runner %s.", runner_metrics.runner_name - ) - return issued_events - - issued_events.add(metric_events.RunnerStop) - - return issued_events - - -def _create_runner_start( - runner_metrics: RunnerMetrics, flavor: str, job_metrics: Optional[GithubJobMetrics] -) -> metric_events.RunnerStart: - """Create the RunnerStart event. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Returns: - The RunnerStart event. - """ - # When a job gets picked up directly after spawning, the runner_metrics installed timestamp - # might be higher than the pre-job timestamp. This is due to the fact that we issue the runner - # installed timestamp for Openstack after waiting with delays for the runner to be ready. - # We set the idle_duration to 0 in this case. - if runner_metrics.pre_job.timestamp < runner_metrics.installed_timestamp: - logger.warning( - "Pre-job timestamp %d is before installed timestamp %d for runner %s." - " Setting idle_duration to zero", - runner_metrics.pre_job.timestamp, - runner_metrics.installed_timestamp, - runner_metrics.runner_name, - ) - idle_duration = max(runner_metrics.pre_job.timestamp - runner_metrics.installed_timestamp, 0) - - # GitHub API returns started_at < created_at in some rare cases. 
- if job_metrics and job_metrics.queue_duration < 0: - logger.warning( - "Queue duration for runner %s is negative: %f. Setting it to zero.", - runner_metrics.runner_name, - job_metrics.queue_duration, - ) - queue_duration = max(job_metrics.queue_duration, 0) if job_metrics else None - - return metric_events.RunnerStart( - timestamp=runner_metrics.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics.pre_job.workflow, - repo=runner_metrics.pre_job.repository, - github_event=runner_metrics.pre_job.event, - idle=idle_duration, - queue_duration=queue_duration, - ) - - -def _create_runner_stop( - runner_metrics: RunnerMetrics, flavor: str, job_metrics: GithubJobMetrics -) -> metric_events.RunnerStop: - """Create the RunnerStop event. - - Expects that the runner_metrics.post_job is not None. - - Args: - runner_metrics: The metrics for the runner. - flavor: The flavor of the runner. - job_metrics: The metrics about the job run by the runner. - - Raises: - RunnerMetricsError: Post job runner metric not found. Should not happen. - - Returns: - The RunnerStop event. - """ - if runner_metrics.post_job is None: - raise RunnerMetricsError( - "Post job runner metric not found during RunnerStop event, contact developers" - ) - - # When a job gets cancelled directly after spawning, - # the post-job timestamp might be lower then the pre-job timestamp. - # This is due to the fact that we don't have a real post-job script but rather use - # the exit code of the runner application which might exit before the pre-job script - # job is done in edge cases. See also: - # https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job#triggering-the-scripts - # We set the job_duration to 0 in this case. - if runner_metrics.post_job.timestamp < runner_metrics.pre_job.timestamp: - logger.warning( - "Post-job timestamp %d is before pre-job timestamp %d for runner %s." 
- " Setting job_duration to zero", - runner_metrics.post_job.timestamp, - runner_metrics.pre_job.timestamp, - runner_metrics.runner_name, - ) - job_duration = max(runner_metrics.post_job.timestamp - runner_metrics.pre_job.timestamp, 0) - - return metric_events.RunnerStop( - timestamp=runner_metrics.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics.pre_job.workflow, - repo=runner_metrics.pre_job.repository, - github_event=runner_metrics.pre_job.event, - status=runner_metrics.post_job.status, - status_info=runner_metrics.post_job.status_info, - job_duration=job_duration, - job_conclusion=job_metrics.conclusion if job_metrics else None, - ) - - -def _extract_storage( - metrics_storage_manager: MetricsStorageManager, - metrics_storage: MetricsStorage, -) -> Optional[RunnerMetrics]: - """Extract metrics from a metrics storage. - - Args: - metrics_storage_manager: The metrics storage manager. - metrics_storage: The metrics storage for a specific runner. - - Returns: - The extracted metrics if at least the pre-job metrics are present. - """ - runner_name = metrics_storage.runner_name - try: - logger.debug("Extracting metrics from metrics storage for runner %s", runner_name) - metrics_from_fs = _extract_metrics_from_storage(metrics_storage) - except CorruptMetricDataError: - logger.exception("Corrupt metric data found for runner %s", runner_name) - move_to_quarantine(metrics_storage_manager, runner_name) - return None - - logger.debug("Cleaning metrics storage for runner %s", runner_name) - _clean_up_storage( - metrics_storage_manager=metrics_storage_manager, metrics_storage=metrics_storage - ) - return metrics_from_fs - - -def _extract_metrics_from_storage(metrics_storage: MetricsStorage) -> Optional[RunnerMetrics]: - """Extract metrics from metrics storage for a runner. - - Args: - metrics_storage: The metrics storage for a specific runner. - - Returns: - The extracted metrics if at least the pre-job metrics are present. 
- - Raises: - CorruptMetricDataError: Raised if one of the files is not valid or too large. - """ - if too_large_files := _inspect_file_sizes(metrics_storage): - raise CorruptMetricDataError( - f"File size of {too_large_files} is too large. " - f"The limit is {FILE_SIZE_BYTES_LIMIT} bytes." - ) - - runner_name = metrics_storage.runner_name - try: - installed_timestamp = metrics_storage.path.joinpath( - RUNNER_INSTALLED_TS_FILE_NAME - ).read_text() - logger.debug("Runner %s installed at %s", runner_name, installed_timestamp) - except FileNotFoundError: - logger.exception("installed_timestamp not found for runner %s", runner_name) - return None - - try: - pre_job_metrics = _extract_file_from_storage( - metrics_storage=metrics_storage, filename=PRE_JOB_METRICS_FILE_NAME - ) - if not pre_job_metrics: - return None - logger.debug("Pre-job metrics for runner %s: %s", runner_name, pre_job_metrics) - - post_job_metrics = _extract_file_from_storage( - metrics_storage=metrics_storage, filename=POST_JOB_METRICS_FILE_NAME - ) - logger.debug("Post-job metrics for runner %s: %s", runner_name, post_job_metrics) - # TODO: 2024-04-02 - We should define a new error, wrap it and re-raise it. - except CorruptMetricDataError: # pylint: disable=try-except-raise - raise - - try: - return RunnerMetrics( - installed_timestamp=installed_timestamp, - pre_job=PreJobMetrics(**pre_job_metrics), - post_job=PostJobMetrics(**post_job_metrics) if post_job_metrics else None, - runner_name=runner_name, - ) - except ValidationError as exc: - raise CorruptMetricDataError(str(exc)) from exc - - -def _inspect_file_sizes(metrics_storage: MetricsStorage) -> tuple[Path, ...]: - """Inspect the file sizes of the metrics storage. - - Args: - metrics_storage: The metrics storage for a specific runner. - - Returns: - A tuple of files whose size is larger than the limit. 
- """ - files: list[Path] = [ - metrics_storage.path.joinpath(PRE_JOB_METRICS_FILE_NAME), - metrics_storage.path.joinpath(POST_JOB_METRICS_FILE_NAME), - metrics_storage.path.joinpath(RUNNER_INSTALLED_TS_FILE_NAME), - ] - - return tuple( - filter(lambda file: file.exists() and file.stat().st_size > FILE_SIZE_BYTES_LIMIT, files) - ) - - -def _extract_file_from_storage(metrics_storage: MetricsStorage, filename: str) -> dict | None: - """Extract a particular metric file from metrics storage. - - Args: - metrics_storage: The metrics storage for a specific runner. - filename: The metrics filename. - - Raises: - CorruptMetricDataError: If any errors have been found within the metric. - - Returns: - Metrics for the given runner if present. - """ - try: - job_metrics = json.loads( - metrics_storage.path.joinpath(filename).read_text(encoding="utf-8") - ) - except FileNotFoundError: - logger.warning("%s not found for runner %s.", filename, metrics_storage.runner_name) - return None - except JSONDecodeError as exc: - raise CorruptMetricDataError(str(exc)) from exc - if not isinstance(job_metrics, dict): - raise CorruptMetricDataError( - f"{filename} metrics for runner {metrics_storage.runner_name} is not a JSON object." - ) - return job_metrics - - -def _clean_up_storage( - metrics_storage_manager: MetricsStorageManager, metrics_storage: MetricsStorage -) -> None: - """Clean up the metrics storage. - - Remove all metric files and afterwards the storage. - - Args: - metrics_storage_manager: The metrics storage manager. - metrics_storage: The metrics storage for a specific runner. 
- """ - try: - metrics_storage.path.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).unlink(missing_ok=True) - metrics_storage.path.joinpath(PRE_JOB_METRICS_FILE_NAME).unlink(missing_ok=True) - metrics_storage.path.joinpath(POST_JOB_METRICS_FILE_NAME).unlink(missing_ok=True) - except OSError: - logger.exception( - "Could not remove metric files for runner %s, " - "this may lead to duplicate metrics issued", - metrics_storage.runner_name, - ) - - try: - metrics_storage_manager.delete(metrics_storage.runner_name) - except DeleteMetricsStorageError: - logger.exception( - "Could not delete metrics storage for runner %s.", metrics_storage.runner_name - ) diff --git a/src/metrics/runner_logs.py b/src/metrics/runner_logs.py deleted file mode 100644 index ec7923c9c..000000000 --- a/src/metrics/runner_logs.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Functions to pull and remove the logs of the crashed runners.""" - -import logging -import shutil -import time -from datetime import datetime -from pathlib import Path - -RUNNER_LOGS_DIR_PATH = Path("/var/log/github-runner-logs") - -SYSLOG_PATH = Path("/var/log/syslog") - -OUTDATED_LOGS_IN_SECONDS = 7 * 24 * 60 * 60 - -logger = logging.getLogger(__name__) - - -def create_logs_dir(runner_name: str) -> Path: - """Create the directory to store the logs of the crashed runners. - - Args: - runner_name: The name of the runner. - - Returns: - The path to the directory where the logs of the crashed runners will be stored. - """ - target_log_path = RUNNER_LOGS_DIR_PATH / runner_name - target_log_path.mkdir(parents=True, exist_ok=True) - - return target_log_path - - -def remove_outdated() -> None: - """Remove the logs that are too old.""" - maxage_absolute = time.time() - OUTDATED_LOGS_IN_SECONDS - dt_object = datetime.fromtimestamp(maxage_absolute) - logger.info( - "Removing the outdated logs of the crashed runners. 
" - "All logs older than %s will be removed.", - dt_object.strftime("%Y-%m-%d %H:%M:%S"), - ) - - for log_path in RUNNER_LOGS_DIR_PATH.glob("*"): - if log_path.is_dir() and (log_path.stat().st_mtime < maxage_absolute): - logger.info("Removing the outdated logs of the runner %s.", log_path.name) - try: - shutil.rmtree(log_path) - except OSError: - logger.exception( - "Unable to remove the outdated logs of the runner %s.", log_path.name - ) diff --git a/src/metrics/storage.py b/src/metrics/storage.py deleted file mode 100644 index c9b41a2f5..000000000 --- a/src/metrics/storage.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Classes and functions defining the metrics storage. - -It contains a protocol and reference implementation. -""" -import logging -import shutil -import tarfile -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, Iterator, Protocol - -from errors import ( - CreateMetricsStorageError, - DeleteMetricsStorageError, - GetMetricsStorageError, - QuarantineMetricsStorageError, -) - -FILESYSTEM_OWNER = "ubuntu:ubuntu" -FILESYSTEM_BASE_PATH = Path("/home/ubuntu/runner-fs") -FILESYSTEM_QUARANTINE_PATH = Path("/home/ubuntu/runner-fs-quarantine") - -logger = logging.getLogger(__name__) - - -@dataclass -class MetricsStorage: - """Storage for the metrics. - - Attributes: - path: The path to the directory holding the metrics inside the charm. - runner_name: The name of the associated runner. - """ - - path: Path - runner_name: str - - -class StorageManager(Protocol): # pylint: disable=too-few-public-methods - """A protocol defining the methods for managing the metrics storage. - - Attributes: - create: Method to create a new storage. Returns the created storage. - Raises an exception CreateMetricsStorageError if the storage already exists. - list_all: Method to list all storages. - get: Method to get a storage by name. 
- delete: Method to delete a storage by name. - """ - - create: Callable[[str], MetricsStorage] - list_all: Callable[[], Iterator[MetricsStorage]] - get: Callable[[str], MetricsStorage] - delete: Callable[[str], None] - - -def _get_runner_fs_path(runner_name: str) -> Path: - """Get the path of the runner shared filesystem. - - Args: - runner_name: The name of the runner. - - Returns: - The path of the runner shared filesystem. - """ - return FILESYSTEM_BASE_PATH / runner_name - - -def create(runner_name: str) -> MetricsStorage: - """Create metrics storage for the runner. - - The method is not idempotent and will raise an exception - if the storage already exists. - - Args: - runner_name: The name of the runner. - - Returns: - The metrics storage object. - - Raises: - CreateMetricsStorageError: If the creation of the shared filesystem fails. - """ - try: - FILESYSTEM_BASE_PATH.mkdir(exist_ok=True) - FILESYSTEM_QUARANTINE_PATH.mkdir(exist_ok=True) - except OSError as exc: - raise CreateMetricsStorageError("Failed to create metrics storage directories") from exc - - runner_fs_path = _get_runner_fs_path(runner_name) - - try: - runner_fs_path.mkdir() - except FileExistsError as exc: - raise CreateMetricsStorageError( - f"Metrics storage for runner {runner_name} already exists." - ) from exc - - return MetricsStorage(runner_fs_path, runner_name) - - -def list_all() -> Iterator[MetricsStorage]: - """List all the metric storages. - - Yields: - A metrics storage object. - """ - if not FILESYSTEM_BASE_PATH.exists(): - return - - directories = (entry for entry in FILESYSTEM_BASE_PATH.iterdir() if entry.is_dir()) - for directory in directories: - try: - fs = get(runner_name=directory.name) - except GetMetricsStorageError: - logger.error("Failed to get metrics storage for runner %s", directory.name) - else: - yield fs - - -def get(runner_name: str) -> MetricsStorage: - """Get the metrics storage for the runner. - - Args: - runner_name: The name of the runner. 
- - Returns: - The metrics storage object. - - Raises: - GetMetricsStorageError: If the storage does not exist. - """ - runner_fs_path = _get_runner_fs_path(runner_name) - if not runner_fs_path.exists(): - raise GetMetricsStorageError(f"Metrics storage for runner {runner_name} not found.") - - return MetricsStorage(runner_fs_path, runner_name) - - -def delete(runner_name: str) -> None: - """Delete the metrics storage for the runner. - - Args: - runner_name: The name of the runner. - - Raises: - DeleteMetricsStorageError: If the storage could not be deleted. - """ - runner_fs_path = _get_runner_fs_path(runner_name=runner_name) - - try: - shutil.rmtree(runner_fs_path) - except OSError as exc: - raise DeleteMetricsStorageError( - f"Failed to remove metrics storage for runner {runner_name}" - ) from exc - - -def move_to_quarantine(storage_manager: StorageManager, runner_name: str) -> None: - """Archive the metrics storage for the runner and delete it. - - Args: - storage_manager: The storage manager. - runner_name: The name of the runner. - - Raises: - QuarantineMetricsStorageError: If the metrics storage could not be quarantined. 
- """ - try: - runner_fs = storage_manager.get(runner_name) - except GetMetricsStorageError as exc: - raise QuarantineMetricsStorageError( - f"Failed to get metrics storage for runner {runner_name}" - ) from exc - - tarfile_path = FILESYSTEM_QUARANTINE_PATH.joinpath(runner_name).with_suffix(".tar.gz") - try: - with tarfile.open(tarfile_path, "w:gz") as tar: - tar.add(runner_fs.path, arcname=runner_fs.path.name) - except OSError as exc: - raise QuarantineMetricsStorageError( - f"Failed to archive metrics storage for runner {runner_name}" - ) from exc - - try: - storage_manager.delete(runner_name) - except DeleteMetricsStorageError as exc: - raise QuarantineMetricsStorageError( - f"Failed to delete metrics storage for runner {runner_name}" - ) from exc diff --git a/src/metrics/type.py b/src/metrics/type.py deleted file mode 100644 index fd45314f6..000000000 --- a/src/metrics/type.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Data types used by modules handling metrics.""" - -from typing import NamedTuple, Optional - -from github_type import JobConclusion - - -class GithubJobMetrics(NamedTuple): - """Metrics about a job. - - Attributes: - queue_duration: The time in seconds the job took before the runner picked it up. - conclusion: The conclusion of the job. - """ - - queue_duration: float - conclusion: Optional[JobConclusion] diff --git a/src/openstack_cloud/__init__.py b/src/openstack_cloud/__init__.py deleted file mode 100644 index 3f9935aab..000000000 --- a/src/openstack_cloud/__init__.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Module for managing Openstack cloud.""" - -import logging -from pathlib import Path -from typing import TypedDict, cast - -import yaml - -from errors import OpenStackInvalidConfigError - -logger = logging.getLogger(__name__) - - -CLOUDS_YAML_PATH = Path(Path.home() / ".config/openstack/clouds.yaml") - - -class CloudConfig(TypedDict): - """The parsed clouds.yaml configuration dictionary. - - Attributes: - clouds: A mapping of key "clouds" to cloud name mapped to cloud configuration. - """ - - clouds: dict[str, dict] - - -def _validate_cloud_config(cloud_config: dict) -> CloudConfig: - """Validate the format of the cloud configuration. - - Args: - cloud_config: The configuration in clouds.yaml format to validate. - - Raises: - OpenStackInvalidConfigError: if the format of the config is invalid. - - Returns: - A typed cloud_config dictionary. - """ - # dict of format: {clouds: : } - try: - clouds = list(cloud_config["clouds"].keys()) - except KeyError as exc: - raise OpenStackInvalidConfigError("Missing key 'clouds' from config.") from exc - if not clouds: - raise OpenStackInvalidConfigError("No clouds defined in clouds.yaml.") - return cast(CloudConfig, cloud_config) - - -def _write_config_to_disk(cloud_config: CloudConfig) -> None: - """Write the cloud configuration to disk. - - Args: - cloud_config: The configuration in clouds.yaml format to write to disk. - """ - CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) - CLOUDS_YAML_PATH.write_text(encoding="utf-8", data=yaml.dump(cloud_config)) - - -def initialize(cloud_config: dict) -> None: - """Initialize Openstack integration. - - Validates config and writes it to disk. - - Raises: - OpenStackInvalidConfigError: If there was an given cloud config. - - Args: - cloud_config: The configuration in clouds.yaml format to apply. - """ - try: - valid_config = _validate_cloud_config(cloud_config) - # TODO: 2024-04-02 - We should define a new error, wrap it and re-raise it. 
- except OpenStackInvalidConfigError: # pylint: disable=try-except-raise - raise - _write_config_to_disk(valid_config) diff --git a/src/openstack_cloud/openstack_cloud.py b/src/openstack_cloud/openstack_cloud.py deleted file mode 100644 index ad21f4d97..000000000 --- a/src/openstack_cloud/openstack_cloud.py +++ /dev/null @@ -1,597 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Class for accessing OpenStack API for managing servers.""" - -import logging -from contextlib import contextmanager -from dataclasses import dataclass -from datetime import datetime -from functools import reduce -from pathlib import Path -from typing import Iterable, Iterator, cast - -import openstack -import openstack.exceptions -import paramiko -import yaml -from fabric import Connection as SSHConnection -from openstack.compute.v2.keypair import Keypair as OpenstackKeypair -from openstack.compute.v2.server import Server as OpenstackServer -from openstack.connection import Connection as OpenstackConnection -from openstack.network.v2.security_group import SecurityGroup as OpenstackSecurityGroup -from paramiko.ssh_exception import NoValidConnectionsError - -from errors import KeyfileError, OpenStackError, SSHError -from utilities import retry - -logger = logging.getLogger(__name__) - -_CLOUDS_YAML_PATH = Path.home() / ".config/openstack/clouds.yaml" - -# Update the version when the security group rules are not backward compatible. -_SECURITY_GROUP_NAME = "github-runner-v1" - -_CREATE_SERVER_TIMEOUT = 5 * 60 -_SSH_TIMEOUT = 30 -_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") -_TEST_STRING = "test_string" - - -@dataclass -class OpenstackInstance: - """Represents an OpenStack instance. - - Attributes: - server_id: ID of server assigned by OpenStack. - server_name: Name of the server on OpenStack. - instance_id: ID used by OpenstackCloud class to manage the instances. See docs on the - OpenstackCloud. - addresses: IP addresses assigned to the server. 
- status: Status of the server. - """ - - server_id: str - server_name: str - instance_id: str - addresses: list[str] - status: str - - def __init__(self, server: OpenstackServer, prefix: str): - """Construct the object. - - Args: - server: The OpenStack server. - prefix: The name prefix for the servers. - - Raises: - ValueError: Provided server should not be managed under this prefix. - """ - self.server_id = server.id - self.server_name = server.name - self.status = server.status - self.addresses = [ - address["addr"] - for network_addresses in server.addresses.values() - for address in network_addresses - ] - - if not self.server_name.startswith(f"{prefix}-"): - # Should never happen. - raise ValueError( - f"Found openstack server {server.name} managed under prefix {prefix}, contact devs" - ) - self.instance_id = self.server_name[len(prefix) + 1 :] - - -@contextmanager -@retry(tries=2, delay=5, local_logger=logger) -def _get_openstack_connection( - clouds_config: dict[str, dict], cloud: str -) -> Iterator[OpenstackConnection]: - """Create a connection context managed object, to be used within with statements. - - The file of _CLOUDS_YAML_PATH should only be modified by this function. - - Args: - clouds_config: The configuration in clouds.yaml format to apply. - cloud: The name of cloud to use in the clouds.yaml. - - Raises: - OpenStackError: if the credentials provided is not authorized. - - Yields: - An openstack.connection.Connection object. - """ - if not _CLOUDS_YAML_PATH.exists(): - _CLOUDS_YAML_PATH.parent.mkdir(parents=True, exist_ok=True) - - # Concurrency: Very small chance for the file to be corrupted due to multiple process calling - # this function and writing the file at the same time. This should cause the `conn.authorize` - # to fail, and retry of this function would resolve this. 
- _CLOUDS_YAML_PATH.write_text(data=yaml.dump(clouds_config), encoding="utf-8") - - # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but - # I could not reproduce it. Therefore, no catch here for such exception. - try: - with openstack.connect(cloud=cloud) as conn: - conn.authorize() - yield conn - # pylint thinks this isn't an exception, but does inherit from Exception class. - except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause - logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc - - -class OpenstackCloud: - """Client to interact with OpenStack cloud. - - The OpenStack server name is managed by this cloud. Caller refers to the instances via - instance_id. If the caller needs the server name, e.g., for logging, it can be queried with - get_server_name. - """ - - def __init__(self, clouds_config: dict[str, dict], cloud: str, prefix: str): - """Create the object. - - Args: - clouds_config: The openstack clouds.yaml in dict format. - cloud: The name of cloud to use in the clouds.yaml. - prefix: Prefix attached to names of resource managed by this instance. Used for - identifying which resource belongs to this instance. - """ - self._clouds_config = clouds_config - self._cloud = cloud - self.prefix = prefix - - # Ignore "Too many arguments" as 6 args should be fine. Move to a dataclass if new args are - # added. - def launch_instance( # pylint: disable=R0913 - self, instance_id: str, image: str, flavor: str, network: str, cloud_init: str - ) -> OpenstackInstance: - """Create an OpenStack instance. - - Args: - instance_id: The instance ID to form the instance name. - image: The image used to create the instance. - flavor: The flavor used to create the instance. - network: The network used to create the instance. - cloud_init: The cloud init userdata to startup the instance. - - Raises: - OpenStackError: Unable to create OpenStack server. 
- - Returns: - The OpenStack instance created. - """ - full_name = self.get_server_name(instance_id) - logger.info("Creating openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - security_group = OpenstackCloud._ensure_security_group(conn) - keypair = OpenstackCloud._setup_keypair(conn, full_name) - - try: - server = conn.create_server( - name=full_name, - image=image, - key_name=keypair.name, - flavor=flavor, - network=network, - security_groups=[security_group.id], - userdata=cloud_init, - auto_ip=False, - timeout=_CREATE_SERVER_TIMEOUT, - wait=True, - ) - except openstack.exceptions.ResourceTimeout as err: - logger.exception("Timeout creating openstack server %s", full_name) - logger.info( - "Attempting clean up of openstack server %s that timeout during creation", - full_name, - ) - self._delete_instance(conn, full_name) - raise OpenStackError(f"Timeout creating openstack server {full_name}") from err - except openstack.exceptions.SDKException as err: - logger.exception("Failed to create openstack server %s", full_name) - self._delete_keypair(conn, instance_id) - raise OpenStackError(f"Failed to create openstack server {full_name}") from err - - return OpenstackInstance(server, self.prefix) - - def get_instance(self, instance_id: str) -> OpenstackInstance | None: - """Get OpenStack instance by instance ID. - - Args: - instance_id: The instance ID. - - Returns: - The OpenStack instance if found. - """ - full_name = self.get_server_name(instance_id) - logger.info("Getting openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - if server is not None: - return OpenstackInstance(server, self.prefix) - return None - - def delete_instance(self, instance_id: str) -> None: - """Delete a openstack instance. 
- - Args: - instance_id: The instance ID of the instance to delete. - """ - full_name = self.get_server_name(instance_id) - logger.info("Deleting openstack server with %s", full_name) - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - self._delete_instance(conn, full_name) - - def _delete_instance(self, conn: OpenstackConnection, full_name: str) -> None: - """Delete a openstack instance. - - Raises: - OpenStackError: Unable to delete OpenStack server. - - Args: - conn: The openstack connection to use. - full_name: The full name of the server. - """ - try: - server = OpenstackCloud._get_and_ensure_unique_server(conn, full_name) - if server is not None: - conn.delete_server(name_or_id=server.id) - OpenstackCloud._delete_keypair(conn, full_name) - except ( - openstack.exceptions.SDKException, - openstack.exceptions.ResourceTimeout, - ) as err: - raise OpenStackError(f"Failed to remove openstack runner {full_name}") from err - - def get_ssh_connection(self, instance: OpenstackInstance) -> SSHConnection: - """Get SSH connection to an OpenStack instance. - - Args: - instance: The OpenStack instance to connect to. - - Raises: - SSHError: Unable to get a working SSH connection to the instance. - KeyfileError: Unable to find the keyfile to connect to the instance. - - Returns: - SSH connection object. 
- """ - key_path = OpenstackCloud._get_key_path(instance.server_name) - - if not key_path.exists(): - raise KeyfileError( - f"Missing keyfile for server: {instance.server_name}, key path: {key_path}" - ) - if not instance.addresses: - raise SSHError(f"No addresses found for OpenStack server {instance.server_name}") - - for ip in instance.addresses: - try: - connection = SSHConnection( - host=ip, - user="ubuntu", - connect_kwargs={"key_filename": str(key_path)}, - connect_timeout=_SSH_TIMEOUT, - ) - result = connection.run(f"echo {_TEST_STRING}", warn=True, timeout=_SSH_TIMEOUT) - if not result.ok: - logger.warning( - "SSH test connection failed, server: %s, address: %s", - instance.server_name, - ip, - ) - continue - if _TEST_STRING in result.stdout: - return connection - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException): - logger.warning( - "Unable to SSH into %s with address %s", - instance.server_name, - connection.host, - exc_info=True, - ) - continue - raise SSHError( - f"No connectable SSH addresses found, server: {instance.server_name}, " - f"addresses: {instance.addresses}" - ) - - def get_instances(self) -> tuple[OpenstackInstance, ...]: - """Get all OpenStack instances. - - Returns: - The OpenStack instances. 
- """ - logger.info("Getting all openstack servers managed by the charm") - - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - instance_list = self._get_openstack_instances(conn) - server_names = set(server.name for server in instance_list) - - server_list = [ - OpenstackCloud._get_and_ensure_unique_server(conn, name) for name in server_names - ] - return tuple( - OpenstackInstance(server, self.prefix) - for server in server_list - if server is not None - ) - - def cleanup(self) -> None: - """Cleanup unused key files and openstack keypairs.""" - with _get_openstack_connection( - clouds_config=self._clouds_config, cloud=self._cloud - ) as conn: - instances = self._get_openstack_instances(conn) - exclude_list = [server.name for server in instances] - self._cleanup_key_files(exclude_list) - self._cleanup_openstack_keypairs(conn, exclude_list) - - def get_server_name(self, instance_id: str) -> str: - """Get server name on OpenStack. - - Args: - instance_id: ID used to identify a instance. - - Returns: - The OpenStack server name. - """ - return f"{self.prefix}-{instance_id}" - - def _cleanup_key_files(self, exclude_instances: Iterable[str]) -> None: - """Delete all SSH key files except the specified instances. - - Args: - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up SSH key files") - exclude_filename = set( - OpenstackCloud._get_key_path(instance) for instance in exclude_instances - ) - - total = 0 - deleted = 0 - for path in _SSH_KEY_PATH.iterdir(): - # Find key file from this application. 
- if path.is_file() and path.name.startswith(self.prefix) and path.name.endswith(".key"): - total += 1 - if path in exclude_filename: - continue - path.unlink() - deleted += 1 - logger.info("Found %s key files, clean up %s key files", total, deleted) - - def _cleanup_openstack_keypairs( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all OpenStack keypairs except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up openstack keypairs") - exclude_instance_set = set(exclude_instances) - keypairs = conn.list_keypairs() - for key in keypairs: - # The `name` attribute is of resource.Body type. - if key.name and str(key.name).startswith(self.prefix): - if str(key.name) in exclude_instance_set: - continue - try: - self._delete_keypair(conn, key.name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - key.name, - ) - - def _get_openstack_instances(self, conn: OpenstackConnection) -> tuple[OpenstackServer, ...]: - """Get the OpenStack servers managed by this unit. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - List of OpenStack instances. - """ - return tuple( - server - for server in cast(list[OpenstackServer], conn.list_servers()) - if server.name.startswith(f"{self.prefix}-") - ) - - @staticmethod - def _get_and_ensure_unique_server( - conn: OpenstackConnection, name: str - ) -> OpenstackServer | None: - """Get the latest server of the name and ensure it is unique. - - If multiple servers with the same name are found, the latest server in creation time is - returned. Other servers is deleted. - - Args: - conn: The connection to OpenStack. - name: The name of the OpenStack name. - - Returns: - A server with the name. 
- """ - servers: list[OpenstackServer] = conn.search_servers(name) - - if not servers: - return None - - # 2024/08/14: The `format` arg for `strptime` is the default format. - # This is only provided to get around a bug of the function with type checking. - latest_server = reduce( - lambda a, b: ( - a - if datetime.strptime(a.created_at, "a %b %d %H:%M:%S %Y") - < datetime.strptime(b.create_at, "a %b %d %H:%M:%S %Y") - else b - ), - servers, - ) - outdated_servers = filter(lambda x: x != latest_server, servers) - for server in outdated_servers: - try: - conn.delete_server(name_or_id=server.id) - except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): - logger.warning( - "Unable to delete server with duplicate name %s with ID %s", - name, - server.id, - stack_info=True, - ) - - return latest_server - - @staticmethod - def _get_key_path(name: str) -> Path: - """Get the filepath for storing private SSH of a runner. - - Args: - name: The name of the runner. - - Returns: - Path to reserved for the key file of the runner. - """ - return _SSH_KEY_PATH / f"{name}.key" - - @staticmethod - def _setup_keypair(conn: OpenstackConnection, name: str) -> OpenstackKeypair: - """Create OpenStack keypair. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the keypair. - - Returns: - The OpenStack keypair. - """ - key_path = OpenstackCloud._get_key_path(name) - - if key_path.exists(): - logger.warning("Existing private key file for %s found, removing it.", name) - key_path.unlink(missing_ok=True) - - keypair = conn.create_keypair(name=name) - key_path.parent.mkdir(parents=True, exist_ok=True) - key_path.write_text(keypair.private_key) - key_path.chmod(0o400) - return keypair - - @staticmethod - def _delete_keypair(conn: OpenstackConnection, name: str) -> None: - """Delete OpenStack keypair. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the keypair. 
- """ - try: - # Keypair have unique names, access by ID is not needed. - if not conn.delete_keypair(name): - logger.warning("Unable to delete keypair for %s", name) - except (openstack.exceptions.SDKException, openstack.exceptions.ResourceTimeout): - logger.warning("Unable to delete keypair for %s", name, stack_info=True) - - key_path = OpenstackCloud._get_key_path(name) - key_path.unlink(missing_ok=True) - - @staticmethod - def _ensure_security_group(conn: OpenstackConnection) -> OpenstackSecurityGroup: - """Ensure runner security group exists. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - The security group with the rules for runners. - """ - rule_exists_icmp = False - rule_exists_ssh = False - rule_exists_tmate_ssh = False - - security_group_list = conn.list_security_groups(filters={"name": _SECURITY_GROUP_NAME}) - # Pick the first security_group returned. - security_group = next(iter(security_group_list), None) - if security_group is None: - logger.info("Security group %s not found, creating it", _SECURITY_GROUP_NAME) - security_group = conn.create_security_group( - name=_SECURITY_GROUP_NAME, - description="For servers managed by the github-runner charm.", - ) - else: - existing_rules = security_group.security_group_rules - for rule in existing_rules: - if rule["protocol"] == "icmp": - logger.debug( - "Found ICMP rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - rule_exists_icmp = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 22 - ): - logger.debug( - "Found SSH rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - rule_exists_ssh = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 10022 - ): - logger.debug( - "Found tmate SSH rule in existing security group %s of ID %s", - _SECURITY_GROUP_NAME, - security_group.id, - ) - 
rule_exists_tmate_ssh = True - - if not rule_exists_icmp: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - protocol="icmp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - port_range_min="22", - port_range_max="22", - protocol="tcp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_tmate_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group.id, - port_range_min="10022", - port_range_max="10022", - protocol="tcp", - direction="egress", - ethertype="IPv4", - ) - return security_group diff --git a/src/openstack_cloud/openstack_manager.py b/src/openstack_cloud/openstack_manager.py deleted file mode 100644 index 379d2ae4c..000000000 --- a/src/openstack_cloud/openstack_manager.py +++ /dev/null @@ -1,1598 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -# TODO: 2024-04-11 The module contains too many lines which are scheduled for refactoring. -# pylint: disable=too-many-lines - -# TODO: 2024-04-22 The module contains duplicate code which is scheduled for refactoring. 
-# Lines related to issuing metrics are duplicated: -# ==openstack_cloud.openstack_manager:[1320:1337] -# ==runner_manager:[383:413] -# ==openstack_cloud.openstack_manager:[1283:1314] -# ==runner_manager:[339:368] - -# pylint: disable=duplicate-code - -"""Module for handling interactions with OpenStack.""" -import logging -import secrets -import shutil -import time -from contextlib import contextmanager -from dataclasses import dataclass -from datetime import datetime -from multiprocessing import Pool -from pathlib import Path -from typing import Iterable, Iterator, Literal, Optional, cast - -import invoke -import jinja2 -import openstack -import openstack.connection -import openstack.exceptions -import openstack.image.v2.image -import paramiko -from fabric import Connection as SSHConnection -from openstack.compute.v2.server import Server -from openstack.connection import Connection as OpenstackConnection -from openstack.exceptions import SDKException -from openstack.network.v2.security_group import SecurityGroup -from paramiko.ssh_exception import NoValidConnectionsError - -import reactive.runner_manager as reactive_runner_manager -from charm_state import CharmState, GithubOrg, ProxyConfig, SSHDebugConnection -from errors import ( - CreateMetricsStorageError, - GetMetricsStorageError, - GithubApiError, - GithubClientError, - GithubMetricsError, - IssueMetricEventError, - OpenStackError, - RunnerCreateError, - RunnerStartError, -) -from github_client import GithubClient -from github_type import GitHubRunnerStatus, SelfHostedRunner -from metrics import events as metric_events -from metrics import github as github_metrics -from metrics import runner as runner_metrics -from metrics import storage as metrics_storage -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from repo_policy_compliance_client import RepoPolicyComplianceClient -from runner_manager import IssuedMetricEventsStats -from runner_manager_type import FlushMode, OpenstackRunnerManagerConfig 
-from runner_type import GithubPath, RunnerGithubInfo, RunnerNameByHealth -from utilities import retry, set_env_var - -logger = logging.getLogger(__name__) - -# Update the version when the security group rules are not backward compatible. -SECURITY_GROUP_NAME = "github-runner-v1" -BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME = "scripts/build-openstack-image.sh" -_SSH_KEY_PATH = Path("/home/ubuntu/.ssh") -_CONFIG_SCRIPT_PATH = Path("/home/ubuntu/actions-runner/config.sh") - -RUNNER_APPLICATION = Path("/home/ubuntu/actions-runner") -METRICS_EXCHANGE_PATH = Path("/home/ubuntu/metrics-exchange") -PRE_JOB_SCRIPT = RUNNER_APPLICATION / "pre-job.sh" -MAX_METRICS_FILE_SIZE = 1024 - -RUNNER_STARTUP_PROCESS = "/home/ubuntu/actions-runner/run.sh" -RUNNER_LISTENER_PROCESS = "Runner.Listener" -RUNNER_WORKER_PROCESS = "Runner.Worker" -CREATE_SERVER_TIMEOUT = 5 * 60 - - -class _PullFileError(Exception): - """Represents an error while pulling a file from the runner instance.""" - - def __init__(self, reason: str): - """Construct PullFileError object. - - Args: - reason: The reason for the error. - """ - super().__init__(reason) - - -class _SSHError(Exception): - """Represents an error while interacting with SSH.""" - - def __init__(self, reason: str): - """Construct SSHErrors object. - - Args: - reason: The reason for the error. - """ - super().__init__(reason) - - -@dataclass -class InstanceConfig: - """The configuration values for creating a single runner instance. - - Attributes: - github_path: The GitHub repo/org path to register the runner. - image_id: The Openstack image id to use to boot the instance with. - labels: The runner instance labels. - name: Name of the image to launch the GitHub runner instance with. - registration_token: Token for registering the runner on GitHub. 
- """ - - github_path: GithubPath - image_id: str - labels: Iterable[str] - name: str - registration_token: str - - -SupportedCloudImageArch = Literal["amd64", "arm64"] - - -@dataclass -class _CloudInitUserData: - """Dataclass to hold cloud init userdata. - - Attributes: - instance_config: The configuration values for Openstack instance to launch. - runner_env: The contents of .env to source when launching Github runner. - pre_job_contents: The contents of pre-job script to run before starting the job. - proxies: Proxy values to enable on the Github runner. - dockerhub_mirror: URL to dockerhub mirror. - """ - - instance_config: InstanceConfig - runner_env: str - pre_job_contents: str - dockerhub_mirror: Optional[str] = None - proxies: Optional[ProxyConfig] = None - - -@contextmanager -def _create_connection(cloud_config: dict[str, dict]) -> Iterator[openstack.connection.Connection]: - """Create a connection context managed object, to be used within with statements. - - This method should be called with a valid cloud_config. See _validate_cloud_config. - Also, this method assumes that the clouds.yaml exists on ~/.config/openstack/clouds.yaml. - See charm_state.py _write_openstack_config_to_disk. - - Args: - cloud_config: The configuration in clouds.yaml format to apply. - - Raises: - OpenStackError: if the credentials provided is not authorized. - - Yields: - An openstack.connection.Connection object. - """ - clouds = list(cloud_config["clouds"].keys()) - if len(clouds) > 1: - logger.warning("Multiple clouds defined in clouds.yaml. Using the first one to connect.") - cloud_name = clouds[0] - - # api documents that keystoneauth1.exceptions.MissingRequiredOptions can be raised but - # I could not reproduce it. Therefore, no catch here for such exception. - try: - with openstack.connect(cloud=cloud_name) as conn: - conn.authorize() - yield conn - # pylint thinks this isn't an exception, but does inherit from Exception class. 
- except openstack.exceptions.HttpException as exc: # pylint: disable=bad-exception-cause - logger.exception("OpenStack API call failure") - raise OpenStackError("Failed OpenStack API call") from exc - - -# Disable too many arguments, as they are needed to create the dataclass. -def create_instance_config( # pylint: disable=too-many-arguments - app_name: str, - unit_num: int, - image_id: str, - path: GithubPath, - labels: Iterable[str], - registration_token: str, -) -> InstanceConfig: - """Create an instance config from charm data. - - Args: - app_name: The juju application name. - unit_num: The juju unit number. - image_id: The openstack image id to create the instance with. - path: Github organisation or repository path. - labels: Addition labels for the runner. - registration_token: The Github runner registration token. See \ - https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository - - Returns: - Instance configuration created. - """ - suffix = secrets.token_hex(12) - return InstanceConfig( - github_path=path, - image_id=image_id, - labels=labels, - name=f"{app_name}-{unit_num}-{suffix}", - registration_token=registration_token, - ) - - -def _generate_runner_env( - templates_env: jinja2.Environment, - dockerhub_mirror: Optional[str] = None, - ssh_debug_connections: list[SSHDebugConnection] | None = None, -) -> str: - """Generate Github runner .env file contents. - - Proxy configuration are handled by aproxy. - - Args: - templates_env: The jinja template environment. - dockerhub_mirror: The url to Dockerhub to reduce rate limiting. - ssh_debug_connections: Tmate SSH debug connection information to load as environment vars. - - Returns: - The .env contents to be loaded by Github runner. 
- """ - return templates_env.get_template("env.j2").render( - pre_job_script=str(PRE_JOB_SCRIPT), - dockerhub_mirror=dockerhub_mirror or "", - ssh_debug_info=(secrets.choice(ssh_debug_connections) if ssh_debug_connections else None), - ) - - -def _generate_cloud_init_userdata( - templates_env: jinja2.Environment, - cloud_init_userdata: _CloudInitUserData, -) -> str: - """Generate cloud init userdata to launch at startup. - - Args: - templates_env: The jinja template environment. - cloud_init_userdata: The dataclass containing the cloud init userdata. - - Returns: - The cloud init userdata script. - """ - runner_group = None - instance_config = cloud_init_userdata.instance_config - proxies = cloud_init_userdata.proxies - - if isinstance(instance_config.github_path, GithubOrg): - runner_group = instance_config.github_path.group - - aproxy_address = proxies.aproxy_address if proxies is not None else None - return templates_env.get_template("openstack-userdata.sh.j2").render( - github_url=f"https://github.com/{instance_config.github_path.path()}", - runner_group=runner_group, - token=instance_config.registration_token, - instance_labels=",".join(instance_config.labels), - instance_name=instance_config.name, - env_contents=cloud_init_userdata.runner_env, - pre_job_contents=cloud_init_userdata.pre_job_contents, - metrics_exchange_path=str(METRICS_EXCHANGE_PATH), - aproxy_address=aproxy_address, - dockerhub_mirror=cloud_init_userdata.dockerhub_mirror, - ) - - -class GithubRunnerRemoveError(Exception): - """Represents an error removing registered runner from Github.""" - - -_INSTANCE_STATUS_SHUTOFF = "SHUTOFF" -_INSTANCE_STATUS_ERROR = "ERROR" -_INSTANCE_STATUS_ACTIVE = "ACTIVE" -_INSTANCE_STATUS_BUILDING = "BUILDING" - - -class OpenstackRunnerManager: - """Runner manager for OpenStack-based instances. - - Attributes: - app_name: The juju application name. - unit_num: The juju unit number. - instance_name: Prefix of the name for the set of runners. 
- """ - - def __init__( - self, - app_name: str, - unit_num: int, - openstack_runner_manager_config: OpenstackRunnerManagerConfig, - cloud_config: dict[str, dict], - ): - """Construct OpenstackRunnerManager object. - - Args: - app_name: The juju application name. - unit_num: The juju unit number. - openstack_runner_manager_config: Configurations related to runner manager. - cloud_config: The openstack clouds.yaml in dict format. - """ - # Setting the env var to this process and any child process spawned. - proxies = openstack_runner_manager_config.charm_state.proxy_config - if no_proxy := proxies.no_proxy: - set_env_var("NO_PROXY", no_proxy) - if http_proxy := proxies.http: - set_env_var("HTTP_PROXY", http_proxy) - if https_proxy := proxies.https: - set_env_var("HTTPS_PROXY", https_proxy) - - self.app_name = app_name - self.unit_num = unit_num - self.instance_name = f"{app_name}-{unit_num}" - self._config = openstack_runner_manager_config - self._cloud_config = cloud_config - self._github = GithubClient(token=self._config.token) - - def reconcile(self, quantity: int) -> int: - """Reconcile the quantity of runners. - - Args: - quantity: The number of intended runners. - - Returns: - The change in number of runners. - """ - if self._config.reactive_config: - logger.info("Reactive configuration detected, going into experimental reactive mode.") - return self._reconcile_reactive(quantity) - - start_ts = time.time() - try: - delta = self._reconcile_runners(quantity) - finally: - end_ts = time.time() - self._issue_reconciliation_metrics( - reconciliation_start_ts=start_ts, reconciliation_end_ts=end_ts - ) - - return delta - - def _reconcile_reactive(self, quantity: int) -> int: - """Reconcile runners reactively. - - Args: - quantity: Number of intended runners. - - Returns: - The difference between intended runners and actual runners. In reactive mode - this number is never negative as additional processes should terminate after a timeout. 
- """ - logger.info("Reactive mode is experimental and not yet fully implemented.") - return reactive_runner_manager.reconcile( - quantity=quantity, mq_uri=self._config.reactive_config.mq_uri, queue_name=self.app_name - ) - - def _reconcile_runners(self, quantity: int) -> int: - """Reconcile the number of runners. - - Args: - quantity: The number of intended runners. - - Returns: - The change in number of runners. - """ - with _create_connection(self._cloud_config) as conn: - runner_by_health = self._get_openstack_runner_status(conn) - logger.info( - "Found %s healthy runner and %s unhealthy runner", - len(runner_by_health.healthy), - len(runner_by_health.unhealthy), - ) - logger.debug("Healthy runner: %s", runner_by_health.healthy) - logger.debug("Unhealthy runner: %s", runner_by_health.unhealthy) - remove_token = self._github.get_runner_remove_token(path=self._config.path) - - self._clean_up_runners( - conn=conn, runner_by_health=runner_by_health, remove_token=remove_token - ) - - delta = self._scale( - quantity=quantity, - conn=conn, - runner_by_health=runner_by_health, - remove_token=remove_token, - ) - return delta - - def get_github_runner_info(self) -> tuple[RunnerGithubInfo, ...]: - """Get information on GitHub for the runners. - - Returns: - Collection of runner GitHub information. - """ - remote_runners_list: list[SelfHostedRunner] = self._github.get_runner_github_info( - self._config.path - ) - logger.debug("List of runners found on GitHub:%s", remote_runners_list) - return tuple( - RunnerGithubInfo( - runner["name"], - runner["id"], - runner["status"] == GitHubRunnerStatus.ONLINE, - runner["busy"], - ) - for runner in remote_runners_list - if runner["name"].startswith(f"{self.instance_name}-") - ) - - def _get_openstack_runner_status(self, conn: OpenstackConnection) -> RunnerNameByHealth: - """Get status on OpenStack of each runner. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - Runner status grouped by health. 
- """ - healthy_runner = [] - unhealthy_runner = [] - openstack_instances = self._get_openstack_instances(conn) - - logger.debug("Found openstack instances: %s", openstack_instances) - - for instance in openstack_instances: - if not OpenstackRunnerManager._health_check(conn=conn, server_name=instance.name): - unhealthy_runner.append(instance.name) - else: - healthy_runner.append(instance.name) - - return RunnerNameByHealth(healthy=tuple(healthy_runner), unhealthy=tuple(unhealthy_runner)) - - def _get_openstack_instances(self, conn: OpenstackConnection) -> list[Server]: - """Get the OpenStack servers managed by this unit. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - List of OpenStack instances. - """ - return [ - instance - for instance in cast(list[Server], conn.list_servers()) - if instance.name.startswith(f"{self.instance_name}-") - ] - - @staticmethod - def _health_check( - conn: OpenstackConnection, - server_name: str, - startup: bool = False, - ) -> bool: - """Health check a server instance. - - A healthy server is defined as: - 1. Openstack instance status is ACTIVE or BUILDING. - 2. Openstack instance status is in BUILDING less than CREATE_SERVER_TIMEOUT seconds. - 3. Runner.Worker exists (running a job). - 4. Runner.Listener exists (waiting for job). - 5. GitHub runner status is Idle or Active. - - An undetermined server is marked as healthy when: - 1. SSH fails - could be a transient network error. - 2. The Runner.* processes do not exist. Mark healthy for now to gather data. This is - subject to change to unhealthy once enough data has been gathered. - - Args: - conn: The Openstack connection instance. - server_name: The name of the OpenStack server to health check. - startup: Check only whether the startup is successful. - - Returns: - Whether the instance is healthy. 
- """ - server: Server | None = conn.get_server(name_or_id=server_name) - if not server: - return False - if server.status == (_INSTANCE_STATUS_SHUTOFF, _INSTANCE_STATUS_ERROR): - return False - if server.status not in (_INSTANCE_STATUS_ACTIVE, _INSTANCE_STATUS_BUILDING): - return False - created_at = datetime.strptime(server.created_at, "%Y-%m-%dT%H:%M:%SZ") - current_time = datetime.now(created_at.tzinfo) - elapsed_min = (created_at - current_time).total_seconds() - if server.status == _INSTANCE_STATUS_BUILDING: - return elapsed_min < CREATE_SERVER_TIMEOUT - try: - return OpenstackRunnerManager._ssh_health_check( - conn=conn, server_name=server_name, startup=startup - ) - except _SSHError: - logger.warning("Health check failed, unable to SSH into server: %s", server_name) - return False - - @staticmethod - @retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger) - def _ssh_health_check(conn: OpenstackConnection, server_name: str, startup: bool) -> bool: - """Use SSH to check whether runner application is running. - - A healthy runner is defined as: - 1. SSH connection can be established. - 2. Runner.Worker exists (running a job). - 3. Runner.Listener exists (waiting for job). - - Args: - conn: The Openstack connection instance. - server_name: The openstack server instance to check connections. - startup: Check only whether the startup is successful. - - Raises: - _SSHError: if there was an error SSH-ing into the machine or with the SSH command. - - Returns: - Whether the runner application is running. 
- """ - try: - ssh_conn = OpenstackRunnerManager._get_ssh_connection( - conn=conn, server_name=server_name - ) - except _SSHError as exc: - logger.error("[ALERT]: Unable to SSH to server: %s, reason: %s", server_name, str(exc)) - raise - - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - logger.debug("Output of `ps aux` on %s stderr: %s", server_name, result.stderr) - if not result.ok: - logger.warning("List all process command failed on %s.", server_name) - raise _SSHError(f"List process command failed on {server_name}.") - if RUNNER_STARTUP_PROCESS not in result.stdout: - logger.warning("No startup process found on server %s.", server_name) - raise _SSHError(f"Runner not yet started on {server_name}.") - - logger.info("Runner process found to be healthy on %s", server_name) - if startup: - return True - - if RUNNER_WORKER_PROCESS in result.stdout or RUNNER_LISTENER_PROCESS in result.stdout: - return True - - return False - - @staticmethod - @retry(tries=3, delay=5, max_delay=60, backoff=2, local_logger=logger) - def _get_ssh_connection( - conn: OpenstackConnection, server_name: str, timeout: int = 30 - ) -> SSHConnection: - """Get a valid ssh connection within a network for a given openstack instance. - - The SSH connection will attempt to establish connection until the timeout configured. - - Args: - conn: The Openstack connection instance. - server_name: The Openstack server instance name. - timeout: Timeout in seconds to attempt connection to each available server address. - - Raises: - _SSHError: If there was an error getting a valid SSH connection. - - Returns: - An SSH connection to OpenStack server instance. 
- """ - server: Server | None = conn.get_server(name_or_id=server_name) - if server is None: - raise _SSHError(f"Server gone while trying to get SSH connection: {server_name}.") - if not server.key_name: - raise _SSHError( - f"Unable to create SSH connection, no valid keypair found for {server.name}" - ) - key_path = OpenstackRunnerManager._get_key_path(server.name) - if not key_path.exists(): - raise _SSHError(f"Missing keyfile for server: {server.name}, key path: {key_path}") - network_address_list = server.addresses.values() - if not network_address_list: - raise _SSHError(f"No addresses found for OpenStack server {server.name}") - - server_addresses: list[str] = [ - address["addr"] - for network_addresses in network_address_list - for address in network_addresses - ] - for ip in server_addresses: - try: - connection = SSHConnection( - host=ip, - user="ubuntu", - connect_kwargs={"key_filename": str(key_path)}, - connect_timeout=timeout, - ) - result = connection.run("echo hello world", warn=True, timeout=timeout) - if not result.ok: - logger.warning( - "SSH test connection failed, server: %s, address: %s", server.name, ip - ) - continue - if "hello world" in result.stdout: - return connection - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException): - logger.warning( - "Unable to SSH into %s with address %s", - server.name, - connection.host, - exc_info=True, - ) - continue - raise _SSHError( - f"No connectable SSH addresses found, server: {server.name}, " - f"addresses: {server_addresses}" - ) - - @staticmethod - def _get_key_path(name: str) -> Path: - """Get the filepath for storing private SSH of a runner. - - Args: - name: The name of the runner. - - Returns: - Path to reserved for the key file of the runner. - """ - return _SSH_KEY_PATH / f"runner-{name}.key" - - @dataclass - class _CreateRunnerArgs: - """Arguments for _create_runner method. - - Attributes: - app_name: The juju application name. 
- cloud_config: The clouds.yaml containing the OpenStack credentials. The first cloud - in the file will be used. - config: Configurations related to runner manager. - registration_token: Token for registering the runner on GitHub. - unit_num: The juju unit number. - """ - - app_name: str - cloud_config: dict[str, dict] - config: OpenstackRunnerManagerConfig - registration_token: str - unit_num: int - - @staticmethod - def _create_runner(args: _CreateRunnerArgs) -> None: - """Create a runner on OpenStack cloud. - - Arguments are gathered into a dataclass due to Pool.map needing one argument functions. - - Args: - args: Arguments of the method. - - Raises: - RunnerCreateError: Unable to create the OpenStack runner. - """ - ts_now = time.time() - environment = jinja2.Environment( - loader=jinja2.FileSystemLoader("templates"), autoescape=True - ) - - env_contents = _generate_runner_env( - templates_env=environment, - dockerhub_mirror=args.config.dockerhub_mirror, - ssh_debug_connections=args.config.charm_state.ssh_debug_connections, - ) - - pre_job_contents = OpenstackRunnerManager._render_pre_job_contents( - charm_state=args.config.charm_state, templates_env=environment - ) - - instance_config = create_instance_config( - args.app_name, - args.unit_num, - args.config.image, - args.config.path, - args.config.labels, - args.registration_token, - ) - cloud_user_data = _CloudInitUserData( - instance_config=instance_config, - runner_env=env_contents, - pre_job_contents=pre_job_contents, - dockerhub_mirror=args.config.dockerhub_mirror, - proxies=args.config.charm_state.proxy_config, - ) - cloud_userdata_str = _generate_cloud_init_userdata( - templates_env=environment, - cloud_init_userdata=cloud_user_data, - ) - - with _create_connection(cloud_config=args.cloud_config) as conn: - runner_security_group = OpenstackRunnerManager._ensure_security_group(conn) - OpenstackRunnerManager._setup_runner_keypair(conn, instance_config.name) - - logger.info("Creating runner %s", 
instance_config.name) - try: - instance = conn.create_server( - name=instance_config.name, - image=instance_config.image_id, - key_name=instance_config.name, - flavor=args.config.flavor, - network=args.config.network, - security_groups=[runner_security_group["id"]], - userdata=cloud_userdata_str, - auto_ip=False, - timeout=CREATE_SERVER_TIMEOUT, - wait=True, - ) - except openstack.exceptions.ResourceTimeout as err: - logger.exception("Timeout creating OpenStack runner %s", instance_config.name) - try: - logger.info( - "Attempting to remove OpenStack runner %s that timeout on creation", - instance_config.name, - ) - conn.delete_server(name_or_id=instance_config.name, wait=True) - try: - conn.delete_keypair(instance_config.name) - except openstack.exceptions.SDKException: - logger.exception( - "Unable to delete OpenStack keypair %s", instance_config.name - ) - OpenstackRunnerManager._get_key_path(instance_config.name).unlink( - missing_ok=True - ) - except openstack.exceptions.SDKException: - logger.exception( - "Cleanup of creation failure runner %s has failed", instance_config.name - ) - # Reconcile will attempt to cleanup again prior to spawning new runners. 
- raise RunnerCreateError( - f"Timeout creating OpenStack runner {instance_config.name}" - ) from err - except openstack.exceptions.SDKException as err: - logger.exception("Failed to create OpenStack runner %s", instance_config.name) - raise RunnerCreateError( - f"Failed to create OpenStack runner {instance_config.name}" - ) from err - - logger.info("Waiting runner %s to come online", instance_config.name) - OpenstackRunnerManager._wait_until_runner_process_running(conn, instance.name) - logger.info("Finished creating runner %s", instance_config.name) - ts_after = time.time() - OpenstackRunnerManager._issue_runner_installed_metric( - app_name=args.app_name, - instance_config=instance_config, - install_end_ts=ts_after, - install_start_ts=ts_now, - ) - - @staticmethod - def _render_pre_job_contents( - charm_state: CharmState, templates_env: jinja2.Environment - ) -> str: - """Render the pre-job script contents. - - Args: - charm_state: The charm state object. - templates_env: The jinja template environment. - - Returns: - The rendered pre-job script contents. - """ - pre_job_contents_dict = { - "issue_metrics": True, - "metrics_exchange_path": str(METRICS_EXCHANGE_PATH), - "do_repo_policy_check": False, - } - if repo_policy_config := charm_state.charm_config.repo_policy_compliance: - repo_policy_client = RepoPolicyComplianceClient( - url=repo_policy_config.url, charm_token=repo_policy_config.token - ) - pre_job_contents_dict.update( - { - "repo_policy_base_url": repo_policy_client.base_url, - "repo_policy_one_time_token": repo_policy_client.get_one_time_token(), - "do_repo_policy_check": True, - } - ) - pre_job_contents = templates_env.get_template("pre-job.j2").render(pre_job_contents_dict) - return pre_job_contents - - @staticmethod - def _ensure_security_group(conn: OpenstackConnection) -> SecurityGroup: - """Ensure runner security group exists. - - Args: - conn: The connection object to access OpenStack cloud. 
- - Returns: - The security group with the rules for runners. - """ - rule_exists_icmp = False - rule_exists_ssh = False - rule_exists_tmate_ssh = False - - security_group_list = conn.list_security_groups(filters={"name": SECURITY_GROUP_NAME}) - # Pick the first security_group returned. - security_group = next(iter(security_group_list), None) - - if security_group is None: - logger.info("Security group %s not found, creating it", SECURITY_GROUP_NAME) - security_group = conn.create_security_group( - name=SECURITY_GROUP_NAME, - description="For servers managed by the github-runner charm.", - ) - else: - existing_rules = security_group["security_group_rules"] - for rule in existing_rules: - if rule["protocol"] == "icmp": - logger.debug( - "Found ICMP rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_icmp = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 22 - ): - logger.debug( - "Found SSH rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_ssh = True - if ( - rule["protocol"] == "tcp" - and rule["port_range_min"] == rule["port_range_max"] == 10022 - ): - logger.debug( - "Found tmate SSH rule in existing security group %s of ID %s", - SECURITY_GROUP_NAME, - security_group["id"], - ) - rule_exists_tmate_ssh = True - - if not rule_exists_icmp: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - protocol="icmp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - port_range_min="22", - port_range_max="22", - protocol="tcp", - direction="ingress", - ethertype="IPv4", - ) - if not rule_exists_tmate_ssh: - conn.create_security_group_rule( - secgroup_name_or_id=security_group["id"], - port_range_min="10022", - port_range_max="10022", - protocol="tcp", - direction="egress", - 
ethertype="IPv4", - ) - return security_group - - @staticmethod - def _setup_runner_keypair(conn: OpenstackConnection, name: str) -> None: - """Set up the SSH keypair for a runner. - - Args: - conn: The connection object to access OpenStack cloud. - name: The name of the runner. - """ - private_key_path = OpenstackRunnerManager._get_key_path(name) - - if private_key_path.exists(): - logger.warning("Existing private key file for %s found, removing it.", name) - private_key_path.unlink() - - keypair = conn.create_keypair(name=name) - private_key_path.write_text(keypair.private_key) - shutil.chown(private_key_path, user="ubuntu", group="ubuntu") - private_key_path.chmod(0o400) - - @retry(tries=10, delay=60, local_logger=logger) - @staticmethod - def _wait_until_runner_process_running(conn: OpenstackConnection, instance_name: str) -> None: - """Wait until the runner process is running. - - The waiting to done by the retry declarator. - - Args: - conn: The openstack connection instance. - instance_name: The name of the instance to wait on. - - Raises: - RunnerStartError: Unable perform health check of the runner application. - """ - try: - if not OpenstackRunnerManager._health_check( - conn=conn, server_name=instance_name, startup=True - ): - raise RunnerStartError( - ( - "Unable to find running process of runner application on openstack runner " - f"{instance_name}" - ) - ) - except TimeoutError as err: - raise RunnerStartError( - f"Unable to connect to openstack runner {instance_name}" - ) from err - - @staticmethod - def _issue_runner_installed_metric( - app_name: str, - instance_config: InstanceConfig, - install_start_ts: float, - install_end_ts: float, - ) -> None: - """Issue RunnerInstalled metric. - - Args: - app_name: The juju application name. - instance_config: The configuration values for Openstack instance. - install_start_ts: The timestamp when the installation started. - install_end_ts: The timestamp when the installation ended. 
- """ - try: - metric_events.issue_event( - event=metric_events.RunnerInstalled( - timestamp=install_start_ts, - flavor=app_name, - duration=install_end_ts - install_start_ts, - ), - ) - except IssueMetricEventError: - logger.exception("Failed to issue RunnerInstalled metric") - try: - storage = metrics_storage.create(instance_config.name) - except CreateMetricsStorageError: - logger.exception( - "Failed to create metrics storage for runner %s, " - "will not be able to issue all metrics.", - instance_config.name, - ) - else: - try: - (storage.path / RUNNER_INSTALLED_TS_FILE_NAME).write_text( - str(install_end_ts), encoding="utf-8" - ) - except FileNotFoundError: - logger.exception( - "Failed to write runner-installed.timestamp into metrics storage " - "for runner %s, will not be able to issue all metrics.", - instance_config.name, - ) - - def _remove_runners( - self, - conn: OpenstackConnection, - instance_names: Iterable[str], - remove_token: str | None = None, - num_to_remove: int | float | None = None, - ) -> None: - """Delete runners on Openstack. - - Removes the registered runner from Github if remove_token is provided. - - Args: - conn: The Openstack connection instance. - instance_names: The Openstack server names to delete. - remove_token: The GitHub runner remove token. - num_to_remove: Remove a specified number of runners. Remove all if None. - """ - if num_to_remove is None: - num_to_remove = float("inf") - - name_to_github_id = { - runner["name"]: runner["id"] - for runner in self._github.get_runner_github_info(self._config.path) - } - for instance_name in instance_names: - if num_to_remove < 1: - break - - github_id = name_to_github_id.get(instance_name, None) - self._remove_one_runner(conn, instance_name, github_id, remove_token) - - # Attempt to delete the keys. This is place at the end of deletion, so we can access - # the instances that failed to delete on previous tries. 
- try: - conn.delete_keypair(instance_name) - except openstack.exceptions.SDKException: - logger.exception("Unable to delete OpenStack keypair %s", instance_name) - OpenstackRunnerManager._get_key_path(instance_name).unlink(missing_ok=True) - num_to_remove -= 1 - - def _remove_one_runner( - self, - conn: OpenstackConnection, - instance_name: str, - github_id: int | None = None, - remove_token: str | None = None, - ) -> None: - """Remove one OpenStack runner. - - Args: - conn: The Openstack connection instance. - instance_name: The Openstack server name to delete. - github_id: The runner id on GitHub. - remove_token: The GitHub runner remove token. - """ - logger.info("Attempting to remove OpenStack runner %s", instance_name) - - server: Server | None = conn.get_server(name_or_id=instance_name) - - if server is not None: - logger.info( - "Pulling metrics and deleting server for OpenStack runner %s", instance_name - ) - self._pull_metrics(conn=conn, instance_name=instance_name) - self._remove_openstack_runner(conn, server, remove_token) - else: - logger.info( - "Not found server for OpenStack runner %s marked for deletion", instance_name - ) - - if github_id is not None: - try: - self._github.delete_runner(self._config.path, github_id) - except GithubClientError as exc: - logger.warning("Failed to remove runner from Github %s, %s", instance_name, exc) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. - except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - def _pull_metrics(self, conn: OpenstackConnection, instance_name: str) -> None: - """Pull metrics from the runner into the respective storage for the runner. - - Args: - conn: The Openstack connection instance. - instance_name: The Openstack server name. 
- """ - try: - storage = metrics_storage.get(instance_name) - except GetMetricsStorageError: - logger.exception( - "Failed to get shared metrics storage for runner %s, " - "will not be able to issue all metrics.", - instance_name, - ) - return - - try: - ssh_conn = self._get_ssh_connection(conn=conn, server_name=instance_name) - except _SSHError as exc: - logger.info("Failed to pull metrics for %s: %s", instance_name, exc) - return - - try: - self._pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local_path=str(storage.path / "pre-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - self._pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local_path=str(storage.path / "post-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - return - except _PullFileError as exc: - logger.warning( - "Failed to pull metrics for %s: %s . Will not be able to issue all metrics", - instance_name, - exc, - ) - return - - def _pull_file( - self, ssh_conn: SSHConnection, remote_path: str, local_path: str, max_size: int - ) -> None: - """Pull file from the runner instance. - - Args: - ssh_conn: The SSH connection instance. - remote_path: The file path on the runner instance. - local_path: The local path to store the file. - max_size: If the file is larger than this, it will not be pulled. - - Raises: - _PullFileError: Unable to pull the file from the runner instance. - _SSHError: Issue with SSH connection. 
- """ - try: - result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException) as exc: - raise _SSHError(reason=f"Unable to SSH into {ssh_conn.host}") from exc - if not result.ok: - logger.warning( - ( - "Unable to get file size of %s on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - remote_path, - ssh_conn.host, - result.return_code, - result.stdout, - result.stderr, - ) - raise _PullFileError(reason=f"Unable to get file size of {remote_path}") - - stdout = result.stdout - try: - stdout.strip() - size = int(stdout) - if size > max_size: - raise _PullFileError( - reason=f"File size of {remote_path} too large {size} > {max_size}" - ) - except ValueError as exc: - raise _PullFileError(reason=f"Invalid file size for {remote_path}: {stdout}") from exc - - try: - ssh_conn.get(remote=remote_path, local=local_path) - except (NoValidConnectionsError, TimeoutError, paramiko.ssh_exception.SSHException) as exc: - raise _SSHError(reason=f"Unable to SSH into {ssh_conn.host}") from exc - except OSError as exc: - raise _PullFileError(reason=f"Unable to retrieve file {remote_path}") from exc - - def _remove_openstack_runner( - self, - conn: OpenstackConnection, - server: Server, - remove_token: str | None = None, - ) -> None: - """Remove a OpenStack server hosting the GitHub runner application. - - Args: - conn: The Openstack connection instance. - server: The Openstack server. - remove_token: The GitHub runner remove token. - """ - try: - self._run_github_removal_script(conn=conn, server=server, remove_token=remove_token) - except (TimeoutError, invoke.exceptions.UnexpectedExit, GithubRunnerRemoveError): - logger.warning( - "Failed to run runner removal script for %s", server.name, exc_info=True - ) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. 
- except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - try: - if not conn.delete_server(name_or_id=server.name, wait=True, delete_ips=True): - logger.warning("Server does not exist %s", server.name) - except SDKException as exc: - logger.error("Something wrong deleting the server %s, %s", server.name, exc) - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. - except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - def _run_github_removal_script( - self, conn: OpenstackConnection, server: Server, remove_token: str | None - ) -> None: - """Run Github runner removal script. - - Args: - conn: The Openstack connection instance. - server: The Openstack server instance. - remove_token: The GitHub instance removal token. - - Raises: - GithubRunnerRemoveError: Unable to remove runner from GitHub. - """ - if not remove_token: - return - try: - ssh_conn = OpenstackRunnerManager._get_ssh_connection( - conn=conn, server_name=server.name - ) - except _SSHError as exc: - logger.error( - "Unable to run GitHub removal script, server: %s, reason: %s", - server.name, - str(exc), - ) - raise GithubRunnerRemoveError( - f"Failed to remove runner {server.name} from Github." - ) from exc - - try: - result: invoke.runners.Result = ssh_conn.run( - f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", - warn=True, - ) - if not result.ok: - logger.warning( - ( - "Unable to run removal script on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - server.name, - result.return_code, - result.stdout, - result.stderr, - ) - return - # TODO: 2024-04-23: The broad except clause is for logging purposes. - # Will be removed in future versions. 
- except Exception: # pylint: disable=broad-exception-caught - logger.critical( - "Found unexpected exception, please contact the developers", exc_info=True - ) - - logger.warning("Failed to run GitHub runner removal script %s", server.name) - raise GithubRunnerRemoveError(f"Failed to remove runner {server.name} from Github.") - - def _clean_up_keys_files( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all SSH key files except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up SSH key files") - exclude_filename = set( - OpenstackRunnerManager._get_key_path(instance) for instance in exclude_instances - ) - - total = 0 - deleted = 0 - for path in _SSH_KEY_PATH.iterdir(): - # Find key file from this application. - if ( - path.is_file() - and path.name.startswith(self.instance_name) - and path.name.endswith(".key") - ): - total += 1 - if path.name in exclude_filename: - continue - - keypair_name = path.name.split(".")[0] - try: - conn.delete_keypair(keypair_name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - path.name, - ) - - path.unlink() - deleted += 1 - logger.info("Found %s key files, clean up %s key files", total, deleted) - - def _clean_up_openstack_keypairs( - self, conn: OpenstackConnection, exclude_instances: Iterable[str] - ) -> None: - """Delete all OpenStack keypairs except the specified instances. - - Args: - conn: The Openstack connection instance. - exclude_instances: The keys of these instance will not be deleted. - """ - logger.info("Cleaning up openstack keypairs") - keypairs = conn.list_keypairs() - for key in keypairs: - # The `name` attribute is of resource.Body type. 
- if key.name and str(key.name).startswith(self.instance_name): - if str(key.name) in exclude_instances: - continue - - try: - conn.delete_keypair(key.name) - except openstack.exceptions.SDKException: - logger.warning( - "Unable to delete OpenStack keypair associated with deleted key file %s ", - key.name, - ) - - def _clean_up_runners( - self, conn: OpenstackConnection, runner_by_health: RunnerNameByHealth, remove_token: str - ) -> None: - """Clean up offline or unhealthy runners. - - Args: - conn: The openstack connection instance. - runner_by_health: The runner status grouped by health. - remove_token: The GitHub runner remove token. - - """ - github_info = self.get_github_runner_info() - online_runners = [runner.runner_name for runner in github_info if runner.online] - offline_runners = [runner.runner_name for runner in github_info if not runner.online] - busy_runners = [runner.runner_name for runner in github_info if runner.busy] - logger.info( - "Found %s online and %s offline openstack runners, %s of the runners are busy", - len(online_runners), - len(offline_runners), - len(busy_runners), - ) - logger.debug("Online runner: %s", online_runners) - logger.debug("Offline runner: %s", offline_runners) - logger.debug("Busy runner: %s", busy_runners) - - healthy_runners_set = set(runner_by_health.healthy) - busy_runners_set = set(busy_runners) - busy_unhealthy_runners = set(runner_by_health.unhealthy).intersection(busy_runners_set) - if busy_unhealthy_runners: - logger.warning("Found unhealthy busy runners %s", busy_unhealthy_runners) - - # Clean up offline (SHUTOFF) runners or unhealthy (no connection/cloud-init script) - # runners. - # Possible for a healthy runner to be appear as offline for sometime as GitHub can be - # slow to update the status. - # For busy runners let GitHub decide whether the runner should be removed. 
- instance_to_remove = tuple( - runner - for runner in (*runner_by_health.unhealthy, *offline_runners) - if runner not in healthy_runners_set and runner not in busy_runners_set - ) - logger.debug("Removing following runners with issues %s", instance_to_remove) - self._remove_runners( - conn=conn, instance_names=instance_to_remove, remove_token=remove_token - ) - # Clean up orphan keys, e.g., If openstack instance is removed externally the key - # would not be deleted. - self._clean_up_keys_files(conn, runner_by_health.healthy) - self._clean_up_openstack_keypairs(conn, runner_by_health.healthy) - - def _scale( - self, - quantity: int, - conn: OpenstackConnection, - runner_by_health: RunnerNameByHealth, - remove_token: str, - ) -> int: - """Scale the number of runners. - - Args: - quantity: The number of intended runners. - conn: The openstack connection instance. - runner_by_health: The runner status grouped by health. - remove_token: The GitHub runner remove token. - - Returns: - The change in number of runners. - """ - # Get the number of OpenStack servers. - # This is not calculated due to there might be removal failures. 
- servers = self._get_openstack_instances(conn) - delta = quantity - len(servers) - registration_token = self._github.get_runner_registration_token(path=self._config.path) - - # Spawn new runners - if delta > 0: - logger.info("Creating %s OpenStack runners", delta) - args = [ - OpenstackRunnerManager._CreateRunnerArgs( - app_name=self.app_name, - config=self._config, - cloud_config=self._cloud_config, - registration_token=registration_token, - unit_num=self.unit_num, - ) - for _ in range(delta) - ] - with Pool(processes=min(delta, 10)) as pool: - pool.map( - func=OpenstackRunnerManager._create_runner, - iterable=args, - ) - - elif delta < 0: - logger.info("Removing %s OpenStack runners", delta) - self._remove_runners( - conn=conn, - instance_names=runner_by_health.healthy, - remove_token=remove_token, - num_to_remove=abs(delta), - ) - else: - logger.info("No changes to number of runners needed") - - return delta - - def _issue_reconciliation_metrics( - self, - reconciliation_start_ts: float, - reconciliation_end_ts: float, - ) -> None: - """Issue all reconciliation related metrics. - - This includes the metrics for the runners and the reconciliation metric itself. - - Args: - reconciliation_start_ts: The timestamp of when reconciliation started. - reconciliation_end_ts: The timestamp of when reconciliation ended. - """ - with _create_connection(self._cloud_config) as conn: - runner_states = self._get_openstack_runner_status(conn) - - metric_stats = self._issue_runner_metrics(conn) - self._issue_reconciliation_metric( - metric_stats=metric_stats, - reconciliation_start_ts=reconciliation_start_ts, - reconciliation_end_ts=reconciliation_end_ts, - runner_states=runner_states, - ) - - def _issue_runner_metrics(self, conn: OpenstackConnection) -> IssuedMetricEventsStats: - """Issue runner metrics. - - Args: - conn: The connection object to access OpenStack cloud. - - Returns: - The stats of issued metric events. 
- """ - total_stats: IssuedMetricEventsStats = {} - - try: - openstack_instances = self._get_openstack_instances(conn) - except openstack.exceptions.SDKException: - logger.exception( - "Failed to get openstack instances to ignore when extracting metrics." - " Will not issue runner metrics" - ) - return total_stats - - logger.debug( - "Found following openstack instances before extracting metrics: %s", - openstack_instances, - ) - # Don't extract metrics for instances which are still there, as it might be - # the case that the metrics have not yet been pulled - # (they get pulled right before server termination). - instance_names = {instance.name for instance in openstack_instances} - - for extracted_metrics in runner_metrics.extract( - metrics_storage_manager=metrics_storage, - runners=instance_names, - ): - try: - job_metrics = github_metrics.job( - github_client=self._github, - pre_job_metrics=extracted_metrics.pre_job, - runner_name=extracted_metrics.runner_name, - ) - except GithubMetricsError: - logger.exception("Failed to calculate job metrics") - job_metrics = None - - issued_events = runner_metrics.issue_events( - runner_metrics=extracted_metrics, - job_metrics=job_metrics, - flavor=self.app_name, - ) - for event_type in issued_events: - total_stats[event_type] = total_stats.get(event_type, 0) + 1 - return total_stats - - def _issue_reconciliation_metric( - self, - metric_stats: IssuedMetricEventsStats, - reconciliation_start_ts: float, - reconciliation_end_ts: float, - runner_states: RunnerNameByHealth, - ) -> None: - """Issue reconciliation metric. - - Args: - metric_stats: The stats of issued metric events. - reconciliation_start_ts: The timestamp of when reconciliation started. - reconciliation_end_ts: The timestamp of when reconciliation ended. - runner_states: The states of the runners. - """ - try: - github_info = self.get_github_runner_info() - except GithubApiError: - logger.exception( - "Failed to retrieve github info for reconciliation metric. 
" - "Will not issue reconciliation metric." - ) - return - - online_runners = [runner for runner in github_info if runner.online] - offline_runner_names = {runner.runner_name for runner in github_info if not runner.online} - active_runner_names = {runner.runner_name for runner in online_runners if runner.busy} - healthy_runners = set(runner_states.healthy) - - active_count = len(active_runner_names) - idle_online_count = len(online_runners) - active_count - idle_offline_count = len((offline_runner_names & healthy_runners) - active_runner_names) - - try: - metric_events.issue_event( - event=metric_events.Reconciliation( - timestamp=time.time(), - flavor=self.app_name, - crashed_runners=metric_stats.get(metric_events.RunnerStart, 0) - - metric_stats.get(metric_events.RunnerStop, 0), - idle_runners=idle_online_count + idle_offline_count, - duration=reconciliation_end_ts - reconciliation_start_ts, - ) - ) - except IssueMetricEventError: - logger.exception("Failed to issue Reconciliation metric") - - def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: - """Flush Openstack servers. - - 1. Kill the processes depending on flush mode. - 2. Get unhealthy runners after process purging. - 3. Delete unhealthy runners. - - Args: - mode: The mode to determine which runner to flush. - - Returns: - The number of runners flushed. - """ - logger.info("Flushing OpenStack all runners") - with _create_connection(self._cloud_config) as conn: - self._kill_runner_processes(conn=conn, mode=mode) - runner_by_health = self._get_openstack_runner_status(conn) - remove_token = self._github.get_runner_remove_token(path=self._config.path) - self._remove_runners( - conn=conn, - instance_names=runner_by_health.unhealthy, - remove_token=remove_token, - ) - return len(runner_by_health.unhealthy) - - def _kill_runner_processes(self, conn: OpenstackConnection, mode: FlushMode) -> None: - """Kill runner application that are not running any jobs. 
- - Runners that have not picked up a job has - 1. no Runner.Worker process - 2. no pre-run.sh job process - - Args: - conn: The connection object to access OpenStack cloud. - mode: The flush mode to determine which runner processes to kill. - - Raises: - NotImplementedError: If unsupported flush mode has been passed. - """ - killer_command: str - match mode: - case FlushMode.FLUSH_IDLE: - # only kill Runner.Listener if Runner.Worker does not exist. - killer_command = ( - "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " - "kill $(pgrep -x Runner.Listener)" - ) - case FlushMode.FLUSH_BUSY: - # kill both Runner.Listener and Runner.Worker processes. - # This kills pre-job.sh, a child process of Runner.Worker. - killer_command = ( - "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" - "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);" - ) - case _: - raise NotImplementedError(f"Unsupported flush mode {mode}") - - servers = self._get_openstack_instances(conn=conn) - for server in servers: - ssh_conn: SSHConnection = self._get_ssh_connection(conn=conn, server_name=server.name) - result: invoke.runners.Result = ssh_conn.run( - killer_command, - warn=True, - ) - if not result.ok: - logger.warning("Failed to kill runner process. Instance: %s", server.name) - continue - logger.info("Successfully killed runner process. Instance: %s", server.name) diff --git a/src/openstack_cloud/openstack_runner_manager.py b/src/openstack_cloud/openstack_runner_manager.py deleted file mode 100644 index 6323b65fa..000000000 --- a/src/openstack_cloud/openstack_runner_manager.py +++ /dev/null @@ -1,806 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
- -"""Manager for self-hosted runner on OpenStack.""" - -import logging -import secrets -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Iterator, Sequence - -import invoke -import jinja2 -import paramiko -import paramiko.ssh_exception -from fabric import Connection as SSHConnection - -from charm_state import GithubOrg -from errors import ( - CreateMetricsStorageError, - GetMetricsStorageError, - IssueMetricEventError, - KeyfileError, - OpenStackError, - RunnerCreateError, - RunnerStartError, - SSHError, -) -from manager.cloud_runner_manager import ( - CloudRunnerInstance, - CloudRunnerManager, - CloudRunnerState, - GitHubRunnerConfig, - InstanceId, - SupportServiceConfig, -) -from manager.runner_manager import HealthState -from metrics import events as metric_events -from metrics import runner as runner_metrics -from metrics import storage as metrics_storage -from openstack_cloud.openstack_cloud import OpenstackCloud, OpenstackInstance -from openstack_cloud.openstack_manager import GithubRunnerRemoveError -from repo_policy_compliance_client import RepoPolicyComplianceClient -from utilities import retry - -logger = logging.getLogger(__name__) - -BUILD_OPENSTACK_IMAGE_SCRIPT_FILENAME = "scripts/build-openstack-image.sh" -_CONFIG_SCRIPT_PATH = Path("/home/ubuntu/actions-runner/config.sh") - -RUNNER_APPLICATION = Path("/home/ubuntu/actions-runner") -METRICS_EXCHANGE_PATH = Path("/home/ubuntu/metrics-exchange") -PRE_JOB_SCRIPT = RUNNER_APPLICATION / "pre-job.sh" -MAX_METRICS_FILE_SIZE = 1024 - -RUNNER_STARTUP_PROCESS = "/home/ubuntu/actions-runner/run.sh" -RUNNER_LISTENER_PROCESS = "Runner.Listener" -RUNNER_WORKER_PROCESS = "Runner.Worker" -CREATE_SERVER_TIMEOUT = 5 * 60 - - -class _PullFileError(Exception): - """Represents an error while pulling a file from the runner instance.""" - - -@dataclass -class OpenStackCloudConfig: - """Configuration for OpenStack cloud authorisation information. 
- - Attributes: - clouds_config: The clouds.yaml. - cloud: The cloud name to connect to. - """ - - clouds_config: dict[str, dict] - cloud: str - - -@dataclass -class OpenStackServerConfig: - """Configuration for OpenStack server. - - Attributes: - image: The image name for runners to use. - flavor: The flavor name for runners to use. - network: The network name for runners to use. - """ - - image: str - flavor: str - network: str - - -@dataclass -class _RunnerHealth: - """Runners with health state. - - Attributes: - healthy: The list of healthy runners. - unhealthy: The list of unhealthy runners. - """ - - healthy: tuple[OpenstackInstance, ...] - unhealthy: tuple[OpenstackInstance, ...] - - -class OpenstackRunnerManager(CloudRunnerManager): - """Manage self-hosted runner on OpenStack cloud. - - Attributes: - name_prefix: The name prefix of the runners created. - """ - - # Ignore "Too many arguments", as the class requires a lot of configurations. - def __init__( # pylint: disable=R0913 - self, - prefix: str, - cloud_config: OpenStackCloudConfig, - server_config: OpenStackServerConfig, - runner_config: GitHubRunnerConfig, - service_config: SupportServiceConfig, - ) -> None: - """Construct the object. - - Args: - prefix: The prefix to runner name. - cloud_config: The configuration for OpenStack authorisation. - server_config: The configuration for creating OpenStack server. - runner_config: The configuration for the runner. - service_config: The configuration of supporting services of the runners. - """ - self._prefix = prefix - self._cloud_config = cloud_config - self._server_config = server_config - self._runner_config = runner_config - self._service_config = service_config - self._openstack_cloud = OpenstackCloud( - clouds_config=self._cloud_config.clouds_config, - cloud=self._cloud_config.cloud, - prefix=self.name_prefix, - ) - - @property - def name_prefix(self) -> str: - """The prefix of runner names. 
- - Returns: - The prefix of the runner names managed by this class. - """ - return self._prefix - - def create_runner(self, registration_token: str) -> InstanceId: - """Create a self-hosted runner. - - Args: - registration_token: The GitHub registration token for registering runners. - - Raises: - RunnerCreateError: Unable to create runner due to OpenStack issues. - - Returns: - Instance ID of the runner. - """ - start_timestamp = time.time() - instance_id = OpenstackRunnerManager._generate_instance_id() - instance_name = self._openstack_cloud.get_server_name(instance_id=instance_id) - cloud_init = self._generate_cloud_init( - instance_name=instance_name, registration_token=registration_token - ) - try: - instance = self._openstack_cloud.launch_instance( - instance_id=instance_id, - image=self._server_config.image, - flavor=self._server_config.flavor, - network=self._server_config.network, - cloud_init=cloud_init, - ) - except OpenStackError as err: - raise RunnerCreateError(f"Failed to create {instance_name} openstack runner") from err - - self._wait_runner_startup(instance) - self._wait_runner_running(instance) - - end_timestamp = time.time() - OpenstackRunnerManager._issue_runner_installed_metric( - name=instance_name, - flavor=self.name_prefix, - install_start_timestamp=start_timestamp, - install_end_timestamp=end_timestamp, - ) - return instance_id - - def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: - """Get a self-hosted runner by instance id. - - Args: - instance_id: The instance id. - - Returns: - Information on the runner instance. 
- """ - instance = self._openstack_cloud.get_instance(instance_id) - healthy = self._runner_health_check(instance=instance) - return ( - CloudRunnerInstance( - name=instance.server_name, - instance_id=instance_id, - health=HealthState.HEALTHY if healthy else HealthState.UNHEALTHY, - state=CloudRunnerState.from_openstack_server_status(instance.status), - ) - if instance is not None - else None - ) - - def get_runners( - self, states: Sequence[CloudRunnerState] | None = None - ) -> tuple[CloudRunnerInstance, ...]: - """Get self-hosted runners by state. - - Args: - states: Filter for the runners with these github states. If None all states will be - included. - - Returns: - Information on the runner instances. - """ - instance_list = self._openstack_cloud.get_instances() - instance_list = [ - CloudRunnerInstance( - name=instance.server_name, - instance_id=instance.instance_id, - health=( - HealthState.HEALTHY - if self._runner_health_check(instance) - else HealthState.UNHEALTHY - ), - state=CloudRunnerState.from_openstack_server_status(instance.status), - ) - for instance in instance_list - ] - if states is None: - return tuple(instance_list) - return tuple(instance for instance in instance_list if instance.state in states) - - def delete_runner( - self, instance_id: InstanceId, remove_token: str - ) -> runner_metrics.RunnerMetrics | None: - """Delete self-hosted runners. - - Args: - instance_id: The instance id of the runner to delete. - remove_token: The GitHub remove token. - - Returns: - Any metrics collected during the deletion of the runner. 
- """ - instance = self._openstack_cloud.get_instance(instance_id) - if instance is None: - logger.warning( - "Unable to delete instance %s as it is not found", - self._openstack_cloud.get_server_name(instance_id), - ) - return None - - extracted_metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage, runners=set([instance.server_name]) - ) - self._delete_runner(instance, remove_token) - return next(extracted_metrics, None) - - def flush_runners( - self, remove_token: str, busy: bool = False - ) -> Iterator[runner_metrics.RunnerMetrics]: - """Remove idle and/or busy runners. - - Args: - remove_token: - busy: If false, only idle runners are removed. If true, both idle and busy runners are - removed. - - Returns: - Any metrics retrieved from flushed runners. - """ - instance_list = self._openstack_cloud.get_instances() - for instance in instance_list: - try: - self._check_state_and_flush(instance, busy) - except SSHError: - logger.warning( - "Unable to determine state of %s and kill runner process due to SSH issues", - instance.server_name, - ) - continue - return self.cleanup(remove_token) - - def cleanup(self, remove_token: str) -> Iterator[runner_metrics.RunnerMetrics]: - """Cleanup runner and resource on the cloud. - - Args: - remove_token: The GitHub remove token. - - Returns: - Any metrics retrieved from cleanup runners. - """ - runners = self._get_runners_health() - healthy_runner_names = [runner.server_name for runner in runners.healthy] - metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage, runners=set(healthy_runner_names) - ) - for runner in runners.unhealthy: - self._delete_runner(runner, remove_token) - - self._openstack_cloud.cleanup() - return metrics - - def _delete_runner(self, instance: OpenstackInstance, remove_token: str) -> None: - """Delete self-hosted runners by openstack instance. - - Args: - instance: The OpenStack instance. - remove_token: The GitHub remove token. 
- """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - self._pull_runner_metrics(instance.server_name, ssh_conn) - - try: - OpenstackRunnerManager._run_runner_removal_script( - instance.server_name, ssh_conn, remove_token - ) - except GithubRunnerRemoveError: - logger.warning( - "Unable to run github runner removal script for %s", - instance.server_name, - stack_info=True, - ) - except SSHError: - logger.exception( - "Failed to get SSH connection while removing %s", instance.server_name - ) - logger.warning( - "Skipping runner remove script for %s due to SSH issues", instance.server_name - ) - - try: - self._openstack_cloud.delete_instance(instance.instance_id) - except OpenStackError: - logger.exception( - "Unable to delete openstack instance for runner %s", instance.server_name - ) - - def _get_runners_health(self) -> _RunnerHealth: - """Get runners by health state. - - Returns: - Runners by health state. - """ - runner_list = self._openstack_cloud.get_instances() - - healthy, unhealthy = [], [] - for runner in runner_list: - if self._runner_health_check(runner): - healthy.append(runner) - else: - unhealthy.append(runner) - return _RunnerHealth(healthy=tuple(healthy), unhealthy=tuple(unhealthy)) - - def _runner_health_check(self, instance: OpenstackInstance) -> bool: - """Run health check on a runner. - - Args: - instance: The instance hosting the runner to run health check on. - - Returns: - True if runner is healthy. - """ - cloud_state = CloudRunnerState.from_openstack_server_status(instance.status) - return cloud_state not in set( - ( - CloudRunnerState.DELETED, - CloudRunnerState.ERROR, - CloudRunnerState.STOPPED, - ) - ) and self._health_check(instance) - - def _generate_cloud_init(self, instance_name: str, registration_token: str) -> str: - """Generate cloud init userdata. - - This is the script the openstack server runs on startup. - - Args: - instance_name: The name of the instance. 
- registration_token: The GitHub runner registration token. - - Returns: - The cloud init userdata for openstack instance. - """ - jinja = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) - - env_contents = jinja.get_template("env.j2").render( - pre_job_script=str(PRE_JOB_SCRIPT), - dockerhub_mirror=self._service_config.dockerhub_mirror or "", - ssh_debug_info=( - secrets.choice(self._service_config.ssh_debug_connections) - if self._service_config.ssh_debug_connections - else None - ), - ) - - pre_job_contents_dict = { - "issue_metrics": True, - "metrics_exchange_path": str(METRICS_EXCHANGE_PATH), - "do_repo_policy_check": False, - } - repo_policy = self._get_repo_policy_compliance_client() - if repo_policy is not None: - pre_job_contents_dict.update( - { - "repo_policy_base_url": repo_policy.base_url, - "repo_policy_one_time_token": repo_policy.get_one_time_token(), - "do_repo_policy_check": True, - } - ) - - pre_job_contents = jinja.get_template("pre-job.j2").render(pre_job_contents_dict) - - runner_group = None - if isinstance(self._runner_config.github_path, GithubOrg): - runner_group = self._runner_config.github_path.group - aproxy_address = ( - self._service_config.proxy_config.aproxy_address - if self._service_config.proxy_config is not None - else None - ) - return jinja.get_template("openstack-userdata.sh.j2").render( - github_url=f"https://github.com/{self._runner_config.github_path.path()}", - runner_group=runner_group, - token=registration_token, - instance_labels=",".join(self._runner_config.labels), - instance_name=instance_name, - env_contents=env_contents, - pre_job_contents=pre_job_contents, - metrics_exchange_path=str(METRICS_EXCHANGE_PATH), - aproxy_address=aproxy_address, - dockerhub_mirror=self._service_config.dockerhub_mirror, - ) - - def _get_repo_policy_compliance_client(self) -> RepoPolicyComplianceClient | None: - """Get repo policy compliance client. - - Returns: - The repo policy compliance client. 
- """ - if self._service_config.repo_policy_url and self._service_config.repo_policy_token: - return RepoPolicyComplianceClient( - self._service_config.repo_policy_url, self._service_config.repo_policy_token - ) - return None - - @retry(tries=3, delay=5, backoff=2, local_logger=logger) - def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> None: - """Kill runner process depending on idle or busy. - - Due to update to runner state has some delay with GitHub API. The state of the runner is - determined by which runner processes are running. If the Runner.Worker process is running, - the runner is deemed to be busy. - - Raises: - SSHError: Unable to check the state of the runner and kill the runner process due to - SSH failure. - - Args: - instance: The openstack instance to kill the runner process. - busy: Kill the process if runner is busy, else only kill runner - process if runner is idle. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except KeyfileError: - logger.exception( - "Health check failed due to unable to find keyfile for %s", instance.server_name - ) - return - except SSHError: - logger.exception( - "SSH connection failure with %s during flushing", instance.server_name - ) - raise - - # Using a single command to determine the state and kill the process if needed. - # This makes it more robust when network is unstable. - if busy: - logger.info("Attempting to kill all runner process on %s", instance.server_name) - # kill both Runner.Listener and Runner.Worker processes. - # This kills pre-job.sh, a child process of Runner.Worker. - kill_command = ( - f"pgrep -x {RUNNER_LISTENER_PROCESS} && kill $(pgrep -x {RUNNER_LISTENER_PROCESS});" - f"pgrep -x {RUNNER_WORKER_PROCESS} && kill $(pgrep -x {RUNNER_WORKER_PROCESS});" - ) - else: - logger.info( - "Attempting to kill runner process on %s if not busy", instance.server_name - ) - # Only kill Runner.Listener if Runner.Worker does not exist. 
- kill_command = ( - f"pgrep -x {RUNNER_WORKER_PROCESS} || pgrep -x {RUNNER_LISTENER_PROCESS} && " - f"kill $(pgrep -x {RUNNER_LISTENER_PROCESS})" - ) - # Checking the result of kill command is not useful, as the exit code does not reveal much. - ssh_conn.run(kill_command, warn=True) - - @retry(tries=3, delay=5, backoff=2, local_logger=logger) - def _health_check(self, instance: OpenstackInstance) -> bool: - """Check whether runner is healthy. - - Args: - instance: The OpenStack instance to conduit the health check. - - Raises: - SSHError: Unable to get a SSH connection to the instance. - - Returns: - Whether the runner is healthy. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except KeyfileError: - logger.exception( - "Health check failed due to unable to find keyfile for %s", instance.server_name - ) - return False - except SSHError: - logger.exception( - "SSH connection failure with %s during health check", instance.server_name - ) - raise - return OpenstackRunnerManager._run_health_check(ssh_conn, instance.server_name) - - @staticmethod - def _run_health_check(ssh_conn: SSHConnection, name: str) -> bool: - """Run a health check for runner process. - - Args: - ssh_conn: The SSH connection to the runner. - name: The name of the runner. - - Returns: - Whether the health succeed. - """ - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s: %s", name, result.stderr) - return False - if ( - RUNNER_WORKER_PROCESS not in result.stdout - and RUNNER_LISTENER_PROCESS not in result.stdout - ): - logger.warning("Runner process not found on %s", name) - return False - return True - - @retry(tries=10, delay=60, local_logger=logger) - def _wait_runner_startup(self, instance: OpenstackInstance) -> None: - """Wait until runner is startup. - - Args: - instance: The runner instance. 
- - Raises: - RunnerStartError: The runner startup process was not found on the runner. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SSHError as err: - raise RunnerStartError( - f"Failed to SSH to {instance.server_name} during creation possible due to setup " - "not completed" - ) from err - - result: invoke.runners.Result = ssh_conn.run("ps aux", warn=True) - if not result.ok: - logger.warning("SSH run of `ps aux` failed on %s", instance.server_name) - raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.server_name}") - if RUNNER_STARTUP_PROCESS not in result.stdout: - logger.warning("Runner startup process not found on %s", instance.server_name) - raise RunnerStartError(f"Runner startup process not found on {instance.server_name}") - logger.info("Runner startup process found to be healthy on %s", instance.server_name) - - @retry(tries=5, delay=60, local_logger=logger) - def _wait_runner_running(self, instance: OpenstackInstance) -> None: - """Wait until runner is running. - - Args: - instance: The runner instance. - - Raises: - RunnerStartError: The runner process was not found on the runner. - """ - try: - ssh_conn = self._openstack_cloud.get_ssh_connection(instance) - except SSHError as err: - raise RunnerStartError( - f"Failed to SSH connect to {instance.server_name} openstack runner" - ) from err - - if not self._run_health_check(ssh_conn=ssh_conn, name=instance.server_name): - logger.info("Runner process not found on %s", instance.server_name) - raise RunnerStartError( - f"Runner process on {instance.server_name} failed to initialize on after starting" - ) - - logger.info("Runner process found to be healthy on %s", instance.server_name) - - @staticmethod - def _generate_instance_id() -> InstanceId: - """Generate a instance id. - - Return: - The id. 
- """ - return secrets.token_hex(12) - - @staticmethod - def _issue_runner_installed_metric( - name: str, - flavor: str, - install_start_timestamp: float, - install_end_timestamp: float, - ) -> None: - """Issue metric for runner installed event. - - Args: - name: The name of the runner. - flavor: The flavor of the runner. - install_start_timestamp: The timestamp of installation start. - install_end_timestamp: The timestamp of installation end. - """ - try: - metric_events.issue_event( - event=metric_events.RunnerInstalled( - timestamp=install_start_timestamp, - flavor=flavor, - duration=install_end_timestamp - install_start_timestamp, - ) - ) - except IssueMetricEventError: - logger.exception("Failed to issue RunnerInstalled metric") - - try: - storage = metrics_storage.create(name) - except CreateMetricsStorageError: - logger.exception( - "Failed to create metrics storage for runner %s, " - "will not be able to issue all metrics.", - name, - ) - else: - try: - (storage.path / runner_metrics.RUNNER_INSTALLED_TS_FILE_NAME).write_text( - str(install_end_timestamp), encoding="utf-8" - ) - except FileNotFoundError: - logger.exception( - "Failed to write runner-installed.timestamp into metrics storage " - "for runner %s, will not be able to issue all metrics.", - name, - ) - - @staticmethod - def _pull_runner_metrics(name: str, ssh_conn: SSHConnection) -> None: - """Pull metrics from runner. - - Args: - name: The name of the runner. - ssh_conn: The SSH connection to the runner. 
- """ - try: - storage = metrics_storage.get(name) - except GetMetricsStorageError: - logger.exception( - "Failed to get shared metrics storage for runner %s, " - "will not be able to issue all metrics.", - name, - ) - return - - try: - OpenstackRunnerManager._ssh_pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local_path=str(storage.path / "pre-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - OpenstackRunnerManager._ssh_pull_file( - ssh_conn=ssh_conn, - remote_path=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local_path=str(storage.path / "post-job-metrics.json"), - max_size=MAX_METRICS_FILE_SIZE, - ) - except _PullFileError as exc: - logger.warning( - "Failed to pull metrics for %s: %s . Will not be able to issue all metrics", - name, - exc, - ) - - @staticmethod - def _ssh_pull_file( - ssh_conn: SSHConnection, remote_path: str, local_path: str, max_size: int - ) -> None: - """Pull file from the runner instance. - - Args: - ssh_conn: The SSH connection instance. - remote_path: The file path on the runner instance. - local_path: The local path to store the file. - max_size: If the file is larger than this, it will not be pulled. - - Raises: - _PullFileError: Unable to pull the file from the runner instance. - SSHError: Issue with SSH connection. 
- """ - try: - result = ssh_conn.run(f"stat -c %s {remote_path}", warn=True) - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise SSHError(f"Unable to SSH into {ssh_conn.host}") from exc - if not result.ok: - logger.warning( - ( - "Unable to get file size of %s on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - remote_path, - ssh_conn.host, - result.return_code, - result.stdout, - result.stderr, - ) - raise _PullFileError(f"Unable to get file size of {remote_path}") - - stdout = result.stdout - try: - stdout.strip() - size = int(stdout) - if size > max_size: - raise _PullFileError(f"File size of {remote_path} too large {size} > {max_size}") - except ValueError as exc: - raise _PullFileError(f"Invalid file size for {remote_path}: stdout") from exc - - try: - ssh_conn.get(remote=remote_path, local=local_path) - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise SSHError(f"Unable to SSH into {ssh_conn.host}") from exc - except OSError as exc: - raise _PullFileError(f"Unable to retrieve file {remote_path}") from exc - - @staticmethod - def _run_runner_removal_script( - instance_name: str, ssh_conn: SSHConnection, remove_token: str - ) -> None: - """Run Github runner removal script. - - Args: - instance_name: The name of the runner instance. - ssh_conn: The SSH connection to the runner instance. - remove_token: The GitHub instance removal token. - - Raises: - GithubRunnerRemoveError: Unable to remove runner from GitHub. 
- """ - try: - result = ssh_conn.run( - f"{_CONFIG_SCRIPT_PATH} remove --token {remove_token}", - warn=True, - ) - if result.ok: - return - - logger.warning( - ( - "Unable to run removal script on instance %s, " - "exit code: %s, stdout: %s, stderr: %s" - ), - instance_name, - result.return_code, - result.stdout, - result.stderr, - ) - raise GithubRunnerRemoveError(f"Failed to remove runner {instance_name} from Github.") - except ( - TimeoutError, - paramiko.ssh_exception.NoValidConnectionsError, - paramiko.ssh_exception.SSHException, - ) as exc: - raise GithubRunnerRemoveError( - f"Failed to remove runner {instance_name} from Github." - ) from exc diff --git a/src/reactive/__init__.py b/src/reactive/__init__.py deleted file mode 100644 index 1c7b82dda..000000000 --- a/src/reactive/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Package for code implementing reactive scheduling.""" diff --git a/src/reactive/consumer.py b/src/reactive/consumer.py deleted file mode 100644 index f868feddd..000000000 --- a/src/reactive/consumer.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Module responsible for consuming jobs from the message queue.""" -import contextlib -import logging -import signal -import sys -from contextlib import closing -from types import FrameType -from typing import Generator, cast - -from kombu import Connection -from kombu.simple import SimpleQueue -from pydantic import BaseModel, HttpUrl, ValidationError - -logger = logging.getLogger(__name__) - - -class JobDetails(BaseModel): - """A class to translate the payload. - - Attributes: - labels: The labels of the job. - run_url: The URL of the job. 
- """ - - labels: list[str] - run_url: HttpUrl - - -class JobError(Exception): - """Raised when a job error occurs.""" - - -def consume(mongodb_uri: str, queue_name: str) -> None: - """Consume a job from the message queue. - - Log the job details and acknowledge the message. - If the job details are invalid, reject the message and raise an error. - - Args: - mongodb_uri: The URI of the MongoDB database. - queue_name: The name of the queue. - - Raises: - JobError: If the job details are invalid. - """ - with Connection(mongodb_uri) as conn: - with closing(SimpleQueue(conn, queue_name)) as simple_queue: - with signal_handler(signal.SIGTERM): - msg = simple_queue.get(block=True) - try: - job_details = cast(JobDetails, JobDetails.parse_raw(msg.payload)) - except ValidationError as exc: - msg.reject(requeue=True) - raise JobError(f"Invalid job details: {msg.payload}") from exc - logger.info( - "Received job with labels %s and run_url %s", - job_details.labels, - job_details.run_url, - ) - msg.ack() - - -@contextlib.contextmanager -def signal_handler(signal_code: signal.Signals) -> Generator[None, None, None]: - """Set a signal handler and after the context, restore the default handler. - - The signal handler exits the process. - - Args: - signal_code: The signal code to handle. - """ - _set_signal_handler(signal_code) - try: - yield - finally: - _restore_signal_handler(signal_code) - - -def _set_signal_handler(signal_code: signal.Signals) -> None: - """Set a signal handler which exits the process. - - Args: - signal_code: The signal code to handle. - """ - - def sigterm_handler(signal_code: int, _: FrameType | None) -> None: - """Handle a signal. - - Call sys.exit with the signal code. Kombu should automatically - requeue unacknowledged messages. - - Args: - signal_code: The signal code to handle. - """ - print( - f"Signal '{signal.strsignal(signal_code)}' received. 
Will terminate.", file=sys.stderr - ) - sys.exit(signal_code) - - signal.signal(signal_code, sigterm_handler) - - -def _restore_signal_handler(signal_code: signal.Signals) -> None: - """Restore the default signal handler. - - Args: - signal_code: The signal code to restore. - """ - signal.signal(signal_code, signal.SIG_DFL) diff --git a/src/reactive/runner_manager.py b/src/reactive/runner_manager.py deleted file mode 100644 index 5799731ee..000000000 --- a/src/reactive/runner_manager.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Module for managing reactive runners.""" -import logging -import os -import shutil -import signal - -# All commands run by subprocess are secure. -import subprocess # nosec -from pathlib import Path - -from utilities import secure_run_subprocess - -logger = logging.getLogger(__name__) - -MQ_URI_ENV_VAR = "MQ_URI" -QUEUE_NAME_ENV_VAR = "QUEUE_NAME" -REACTIVE_RUNNER_LOG_DIR = Path("/var/log/reactive_runner") -REACTIVE_RUNNER_SCRIPT_FILE = "scripts/reactive_runner.py" -PYTHON_BIN = "/usr/bin/python3" -REACTIVE_RUNNER_CMD_LINE_PREFIX = f"{PYTHON_BIN} {REACTIVE_RUNNER_SCRIPT_FILE}" -PID_CMD_COLUMN_WIDTH = len(REACTIVE_RUNNER_CMD_LINE_PREFIX) -PIDS_COMMAND_LINE = [ - "ps", - "axo", - f"cmd:{PID_CMD_COLUMN_WIDTH},pid", - "--no-headers", - "--sort=-start_time", -] -UBUNTU_USER = "ubuntu" - - -class ReactiveRunnerError(Exception): - """Raised when a reactive runner error occurs.""" - - -def reconcile(quantity: int, mq_uri: str, queue_name: str) -> int: - """Spawn a runner reactively. - - Args: - quantity: The number of runners to spawn. - mq_uri: The message queue URI. - queue_name: The name of the queue. - - Raises a ReactiveRunnerError if the runner fails to spawn. - - Returns: - The number of reactive runner processes spawned. 
- """ - pids = _get_pids() - current_quantity = len(pids) - logger.info("Current quantity of reactive runner processes: %s", current_quantity) - delta = quantity - current_quantity - if delta > 0: - logger.info("Will spawn %d new reactive runner process(es)", delta) - _setup_logging_for_processes() - for _ in range(delta): - _spawn_runner(mq_uri=mq_uri, queue_name=queue_name) - elif delta < 0: - logger.info("Will kill %d process(es).", -delta) - for pid in pids[:-delta]: - logger.info("Killing reactive runner process with pid %s", pid) - try: - os.kill(pid, signal.SIGTERM) - except ProcessLookupError: - # There can be a race condition that the process has already terminated. - # We just ignore and log the fact. - logger.info( - "Failed to kill process with pid %s. Process might have terminated it self.", - pid, - ) - else: - logger.info("No changes to number of reactive runner processes needed.") - - return delta - - -def _get_pids() -> list[int]: - """Get the PIDs of the reactive runners processes. - - Returns: - The PIDs of the reactive runner processes sorted by start time in descending order. - - Raises: - ReactiveRunnerError: If the command to get the PIDs fails - """ - result = secure_run_subprocess(cmd=PIDS_COMMAND_LINE) - if result.returncode != 0: - raise ReactiveRunnerError("Failed to get list of processes") - - return [ - int(line.rstrip().rsplit(maxsplit=1)[-1]) - for line in result.stdout.decode().split("\n") - if line.startswith(REACTIVE_RUNNER_CMD_LINE_PREFIX) - ] - - -def _setup_logging_for_processes() -> None: - """Set up the log dir.""" - if not REACTIVE_RUNNER_LOG_DIR.exists(): - REACTIVE_RUNNER_LOG_DIR.mkdir() - shutil.chown(REACTIVE_RUNNER_LOG_DIR, user=UBUNTU_USER, group=UBUNTU_USER) - - -def _spawn_runner(mq_uri: str, queue_name: str) -> None: - """Spawn a runner. - - Args: - mq_uri: The message queue URI. - queue_name: The name of the queue. 
- """ - env = { - "PYTHONPATH": "src:lib:venv", - MQ_URI_ENV_VAR: mq_uri, - QUEUE_NAME_ENV_VAR: queue_name, - } - # We do not want to wait for the process to finish, so we do not use with statement. - # We trust the command. - command = " ".join( - [ - PYTHON_BIN, - REACTIVE_RUNNER_SCRIPT_FILE, - ">>", - # $$ will be replaced by the PID of the process, so we can track the error log easily. - f"{REACTIVE_RUNNER_LOG_DIR}/$$.log", - "2>&1", - ] - ) - logger.debug("Spawning a new reactive runner process with command: %s", command) - process = subprocess.Popen( # pylint: disable=consider-using-with # nosec - command, - shell=True, - env=env, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - user=UBUNTU_USER, - ) - - logger.info("Spawned a new reactive runner process with pid %s", process.pid) diff --git a/src/repo_policy_compliance_client.py b/src/repo_policy_compliance_client.py deleted file mode 100644 index 6dbc1d919..000000000 --- a/src/repo_policy_compliance_client.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Client for requesting repo policy compliance service.""" - -import logging -from urllib.parse import urljoin - -import requests -import urllib3 - -logger = logging.getLogger(__name__) - - -# Disable pylint public method number check as this class can be extended in the future. -class RepoPolicyComplianceClient: # pylint: disable=too-few-public-methods - """Client for repo policy compliance service. - - Attributes: - base_url: Base url to the repo policy compliance service. - token: Charm token configured for the repo policy compliance service. - """ - - def __init__(self, url: str, charm_token: str) -> None: - """Construct the RepoPolicyComplianceClient. - - Args: - url: Base URL to the repo policy compliance service. - charm_token: Charm token configured for the repo policy compliance service. 
- """ - self._session = self._create_session() - self.base_url = url - self.token = charm_token - - def get_one_time_token(self) -> str: - """Get a single-use token for repo policy compliance check. - - Raises: - HTTPError: If there was an error getting one-time token from repo-policy-compliance \ - service. - - Returns: - The one-time token to be used in a single request of repo policy compliance check. - """ - url = urljoin(self.base_url, "one-time-token") - try: - response = self._session.get(url, headers={"Authorization": f"Bearer {self.token}"}) - response.raise_for_status() - return response.content.decode("utf-8") - except requests.HTTPError: - logger.exception("Unable to get one time token from repo policy compliance service.") - raise - - def _create_session(self) -> requests.Session: - """Create a new requests session. - - Returns: - A new requests session with retries and no proxy settings. - """ - # The repo policy compliance service might be on localhost and should not have any proxies - # setting configured. This can be changed in the future when we also rely on an - # external service for LXD cloud. 
- adapter = requests.adapters.HTTPAdapter( - max_retries=urllib3.Retry( - total=3, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504] - ) - ) - - session = requests.Session() - session.mount("http://", adapter) - session.mount("https://", adapter) - session.trust_env = False - return session diff --git a/src/runner.py b/src/runner.py index 4610faded..a1b64fcfc 100644 --- a/src/runner.py +++ b/src/runner.py @@ -21,9 +21,12 @@ from typing import Iterable, NamedTuple, Optional, Sequence import yaml +from github_runner_manager.metrics.runner_logs import SYSLOG_PATH, create_logs_dir +from github_runner_manager.metrics.storage import MetricsStorage +from github_runner_manager.types_.github import GitHubOrg import shared_fs -from charm_state import Arch, GithubOrg, SSHDebugConnection, VirtualMachineResources +from charm_state import Arch, SSHDebugConnection, VirtualMachineResources from errors import ( CreateMetricsStorageError, GithubClientError, @@ -38,8 +41,6 @@ ) from lxd import LxdInstance from lxd_type import LxdInstanceConfig -from metrics.runner_logs import SYSLOG_PATH, create_logs_dir -from metrics.storage import MetricsStorage from runner_manager_type import RunnerManagerClients from runner_type import RunnerConfig, RunnerStatus from utilities import execute_command, retry @@ -838,7 +839,7 @@ def _register_runner(self, registration_token: str, labels: Sequence[str]) -> No self.instance.name, ] - if isinstance(self.config.path, GithubOrg): + if isinstance(self.config.path, GitHubOrg): register_cmd += ["--runnergroup", self.config.path.group] logger.info("Executing registration command...") diff --git a/src/runner_manager.py b/src/runner_manager.py index 8d68a68c9..66a7e03d3 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -13,12 +13,19 @@ from pathlib import Path from typing import Iterator, Optional, Type +import github_runner_manager.reactive.runner_manager as reactive_runner_manager import jinja2 import requests import requests.adapters 
import urllib3 +from github_runner_manager.metrics import events as metric_events +from github_runner_manager.metrics import github as github_metrics +from github_runner_manager.metrics import runner as runner_metrics +from github_runner_manager.metrics import runner_logs +from github_runner_manager.metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME +from github_runner_manager.repo_policy_compliance_client import RepoPolicyComplianceClient +from github_runner_manager.types_.github import RunnerApplication, SelfHostedRunner -import reactive.runner_manager as reactive_runner_manager import shared_fs from charm_state import VirtualMachineResources from errors import ( @@ -32,16 +39,14 @@ SubprocessError, ) from github_client import GithubClient -from github_type import RunnerApplication, SelfHostedRunner from lxd import LxdClient, LxdInstance -from metrics import events as metric_events -from metrics import github as github_metrics -from metrics import runner as runner_metrics -from metrics import runner_logs -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from repo_policy_compliance_client import RepoPolicyComplianceClient from runner import LXD_PROFILE_YAML, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus -from runner_manager_type import FlushMode, RunnerInfo, RunnerManagerClients, RunnerManagerConfig +from runner_manager_type import ( + LXDFlushMode, + LXDRunnerManagerConfig, + RunnerInfo, + RunnerManagerClients, +) from runner_type import ProxySetting as RunnerProxySetting from runner_type import RunnerNameByHealth from utilities import execute_command, retry, set_env_var @@ -56,7 +61,7 @@ IssuedMetricEventsStats = dict[Type[metric_events.Event], int] -class RunnerManager: +class LXDRunnerManager: """Manage a group of runners according to configuration. 
Attributes: @@ -71,7 +76,7 @@ def __init__( self, app_name: str, unit: int, - runner_manager_config: RunnerManagerConfig, + runner_manager_config: LXDRunnerManagerConfig, ) -> None: """Construct RunnerManager object for creating and managing runners. @@ -159,7 +164,7 @@ def update_runner_bin(self, binary: RunnerApplication) -> None: try: # Delete old version of runner binary. - RunnerManager.runner_bin_path.unlink(missing_ok=True) + LXDRunnerManager.runner_bin_path.unlink(missing_ok=True) except OSError as err: logger.exception("Unable to perform file operation on the runner binary path") raise RunnerBinaryError("File operation failed on the runner binary path") from err @@ -182,7 +187,7 @@ def update_runner_bin(self, binary: RunnerApplication) -> None: sha256 = hashlib.sha256() - with RunnerManager.runner_bin_path.open(mode="wb") as file: + with LXDRunnerManager.runner_bin_path.open(mode="wb") as file: # Process with chunk_size of 128 KiB. for chunk in response.iter_content(chunk_size=128 * 1024, decode_unicode=False): file.write(chunk) @@ -267,7 +272,7 @@ def _create_runner( config=CreateRunnerConfig( image=self.config.image, resources=resources, - binary_path=RunnerManager.runner_bin_path, + binary_path=LXDRunnerManager.runner_bin_path, registration_token=registration_token, arch=self.config.charm_state.arch, ) @@ -309,7 +314,7 @@ def _create_runner( config=CreateRunnerConfig( image=self.config.image, resources=resources, - binary_path=RunnerManager.runner_bin_path, + binary_path=LXDRunnerManager.runner_bin_path, registration_token=registration_token, arch=self.config.charm_state.arch, ) @@ -447,7 +452,7 @@ def _spawn_new_runners(self, count: int, resources: VirtualMachineResources) -> Raises: RunnerCreateError: If there was an error spawning new runner. 
""" - if not RunnerManager.runner_bin_path.exists(): + if not LXDRunnerManager.runner_bin_path.exists(): raise RunnerCreateError("Unable to create runner due to missing runner binary.") logger.info("Getting registration token for GitHub runners.") registration_token = self._clients.github.get_runner_registration_token(self.config.path) @@ -619,7 +624,7 @@ def _runners_in_pre_job(self) -> bool: return False return True - def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: + def flush(self, mode: LXDFlushMode = LXDFlushMode.FLUSH_IDLE) -> int: """Remove existing runners. Args: @@ -636,7 +641,7 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: remove_token = self._clients.github.get_runner_remove_token(self.config.path) except GithubClientError: logger.exception("Failed to get remove-token to unregister runners from GitHub.") - if mode != FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK: + if mode != LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK: raise logger.info("Proceeding with flush without remove-token.") remove_token = None @@ -656,9 +661,9 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: logger.info(REMOVED_RUNNER_LOG_STR, runner.config.name) if mode in ( - FlushMode.FLUSH_IDLE_WAIT_REPO_CHECK, - FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, - FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_IDLE_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, + LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, ): for _ in range(5): if not self._runners_in_pre_job(): @@ -673,9 +678,9 @@ def flush(self, mode: FlushMode = FlushMode.FLUSH_IDLE) -> int: ) if mode in ( - FlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, - FlushMode.FLUSH_BUSY, - FlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY_WAIT_REPO_CHECK, + LXDFlushMode.FLUSH_BUSY, + LXDFlushMode.FORCE_FLUSH_WAIT_REPO_CHECK, ): busy_runners = [runner for runner in self._get_runners() if runner.status.exist] diff --git a/src/runner_manager_type.py b/src/runner_manager_type.py index 
f3a2112f5..deb30540b 100644 --- a/src/runner_manager_type.py +++ b/src/runner_manager_type.py @@ -9,15 +9,15 @@ from typing import Iterable import jinja2 +from github_runner_manager.repo_policy_compliance_client import RepoPolicyComplianceClient +from github_runner_manager.types_.github import GitHubPath, GitHubRunnerStatus -from charm_state import CharmState, GithubPath, ReactiveConfig +from charm_state import CharmState, ReactiveConfig from github_client import GithubClient -from github_type import GitHubRunnerStatus from lxd import LxdClient -from repo_policy_compliance_client import RepoPolicyComplianceClient -class FlushMode(Enum): +class LXDFlushMode(Enum): """Strategy for flushing runners. During pre-job (repo-check), the runners are marked as idle and if the pre-job fails, the @@ -61,7 +61,7 @@ class RunnerManagerClients: @dataclass # The instance attributes are all required. -class RunnerManagerConfig: # pylint: disable=too-many-instance-attributes +class LXDRunnerManagerConfig: # pylint: disable=too-many-instance-attributes """Configuration of runner manager. 
Attributes: @@ -81,7 +81,7 @@ class RunnerManagerConfig: # pylint: disable=too-many-instance-attributes charm_state: CharmState image: str lxd_storage_path: Path - path: GithubPath + path: GitHubPath service_token: str token: str dockerhub_mirror: str | None = None @@ -113,7 +113,7 @@ class OpenstackRunnerManagerConfig: # pylint: disable=too-many-instance-attribu """ charm_state: CharmState - path: GithubPath + path: GitHubPath labels: Iterable[str] token: str flavor: str diff --git a/src/runner_type.py b/src/runner_type.py index 86769eafd..eec8793ee 100644 --- a/src/runner_type.py +++ b/src/runner_type.py @@ -8,7 +8,9 @@ from pathlib import Path from typing import Optional -from charm_state import GithubPath, SSHDebugConnection +from github_runner_manager.types_.github import GitHubPath + +from charm_state import SSHDebugConnection @dataclass @@ -64,7 +66,7 @@ class RunnerConfig: # pylint: disable=too-many-instance-attributes labels: tuple[str] lxd_storage_path: Path name: str - path: GithubPath + path: GitHubPath proxies: ProxySetting dockerhub_mirror: str | None = None ssh_debug_connections: list[SSHDebugConnection] | None = None diff --git a/src/shared_fs.py b/src/shared_fs.py index 28e97c4fb..48c392113 100644 --- a/src/shared_fs.py +++ b/src/shared_fs.py @@ -7,7 +7,8 @@ from pathlib import Path from typing import Iterator -import metrics.storage as metrics_storage +import github_runner_manager.metrics.storage as metrics_storage + from errors import ( CreateMetricsStorageError, DeleteMetricsStorageError, diff --git a/src/utilities.py b/src/utilities.py index a19effc5c..86c32c4d2 100644 --- a/src/utilities.py +++ b/src/utilities.py @@ -3,14 +3,18 @@ """Utilities used by the charm.""" -import functools import logging import os import pathlib import subprocess # nosec B404 -import time -from typing import Any, Callable, Optional, Sequence, Type, TypeVar - +from typing import Any, Optional, Sequence, TypeVar + +# we import the functions from the utilities module, 
these are used in the charm +from github_runner_manager.utilities import retry # noqa: F401 pylint: disable=unused-import +from github_runner_manager.utilities import ( # noqa: F401 pylint: disable=unused-import + secure_run_subprocess, + set_env_var, +) from typing_extensions import ParamSpec from errors import SubprocessError @@ -24,130 +28,6 @@ ReturnT = TypeVar("ReturnT") -# This decorator has default arguments, one extra argument is not a problem. -def retry( # pylint: disable=too-many-arguments - exception: Type[Exception] = Exception, - tries: int = 1, - delay: float = 0, - max_delay: Optional[float] = None, - backoff: float = 1, - local_logger: logging.Logger = logger, -) -> Callable[[Callable[ParamT, ReturnT]], Callable[ParamT, ReturnT]]: - """Parameterize the decorator for adding retry to functions. - - Args: - exception: Exception type to be retried. - tries: Number of attempts at retry. - delay: Time in seconds to wait between retry. - max_delay: Max time in seconds to wait between retry. - backoff: Factor to increase the delay by each retry. - local_logger: Logger for logging. - - Returns: - The function decorator for retry. - """ - - def retry_decorator( - func: Callable[ParamT, ReturnT], - ) -> Callable[ParamT, ReturnT]: - """Decorate function with retry. - - Args: - func: The function to decorate. - - Returns: - The resulting function with retry added. - """ - - @functools.wraps(func) - def fn_with_retry(*args: ParamT.args, **kwargs: ParamT.kwargs) -> ReturnT: - """Wrap the function with retries. - - Args: - args: The placeholder for decorated function's positional arguments. - kwargs: The placeholder for decorated function's key word arguments. - - Raises: - RuntimeError: Should be unreachable. - - Returns: - Original return type of the decorated function. - """ - remain_tries, current_delay = tries, delay - - for _ in range(tries): - try: - return func(*args, **kwargs) - # Error caught is set by the input of the function. 
- except exception as err: # pylint: disable=broad-exception-caught - remain_tries -= 1 - - if remain_tries == 0: - if local_logger is not None: - local_logger.exception("Retry limit of %s exceed: %s", tries, err) - raise - - if local_logger is not None: - local_logger.warning( - "Retrying error in %s seconds: %s", current_delay, err - ) - local_logger.debug("Error to be retried:", stack_info=True) - - time.sleep(current_delay) - - current_delay *= backoff - - if max_delay is not None: - current_delay = min(current_delay, max_delay) - - raise RuntimeError("Unreachable code of retry logic.") - - return fn_with_retry - - return retry_decorator - - -def secure_run_subprocess( - cmd: Sequence[str], hide_cmd: bool = False, **kwargs: dict[str, Any] -) -> subprocess.CompletedProcess[bytes]: - """Run command in subprocess according to security recommendations. - - CalledProcessError will not be raised on error of the command executed. - Errors should be handled by the caller by checking the exit code. - - The command is executed with `subprocess.run`, additional arguments can be passed to it as - keyword arguments. The following arguments to `subprocess.run` should not be set: - `capture_output`, `shell`, `check`. As those arguments are used by this function. - - Args: - cmd: Command in a list. - hide_cmd: Hide logging of cmd. - kwargs: Additional keyword arguments for the `subprocess.run` call. - - Returns: - Object representing the completed process. The outputs subprocess can accessed. - """ - if not hide_cmd: - logger.info("Executing command %s", cmd) - else: - logger.info("Executing sensitive command") - - result = subprocess.run( # nosec B603 - cmd, - capture_output=True, - # Not running in shell to avoid security problems. - shell=False, - check=False, - # Disable type check due to the support for unpacking arguments in mypy is experimental. 
- **kwargs, # type: ignore - ) - if not hide_cmd: - logger.debug("Command %s returns: %s", cmd, result.stdout) - else: - logger.debug("Command returns: %s", result.stdout) - return result - - def execute_command(cmd: Sequence[str], check_exit: bool = True, **kwargs: Any) -> tuple[str, int]: """Execute a command on a subprocess. @@ -203,19 +83,6 @@ def get_env_var(env_var: str) -> Optional[str]: return os.environ.get(env_var.upper(), os.environ.get(env_var.lower(), None)) -def set_env_var(env_var: str, value: str) -> None: - """Set the environment variable value. - - Set the all upper case and all low case of the `env_var`. - - Args: - env_var: Name of the environment variable. - value: Value to set environment variable to. - """ - os.environ[env_var.upper()] = value - os.environ[env_var.lower()] = value - - def bytes_with_unit_to_kib(num_bytes: str) -> int: """Convert a positive integer followed by a unit to number of kibibytes. diff --git a/templates/dispatch-event.service.j2 b/templates/dispatch-event.service.j2 index 7bef08f96..b4482795f 100644 --- a/templates/dispatch-event.service.j2 +++ b/templates/dispatch-event.service.j2 @@ -4,7 +4,7 @@ Description=Dispatch the {{event}} event on {{unit}} [Service] Type=oneshot # For juju 3 and juju 2 compatibility. The juju-run binary was renamed to juju-exec for juju 3. 
-ExecStart=/usr/bin/run-one /usr/bin/bash -c '/usr/bin/juju-exec "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch" || /usr/bin/juju-run "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch"' +ExecStart=/usr/bin/timeout "{{timeout}}" /usr/bin/run-one /usr/bin/bash -c '/usr/bin/juju-exec "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch" || /usr/bin/juju-run "{{unit}}" "JUJU_DISPATCH_PATH={{event}} ./dispatch"' [Install] WantedBy=multi-user.target diff --git a/templates/openstack-userdata.sh.j2 b/templates/openstack-userdata.sh.j2 deleted file mode 100644 index 047a62be1..000000000 --- a/templates/openstack-userdata.sh.j2 +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/sh - -set -e - -hostnamectl set-hostname github-runner - -# Write .env contents -su - ubuntu -c 'cd ~/actions-runner && echo "{{ env_contents }}" > .env' - -{% if aproxy_address %} -snap install aproxy --edge -snap set aproxy proxy={{ aproxy_address }} listen=:54969 -cat << EOF > /etc/nftables.conf -define default-ip = $(ip route get $(ip route show 0.0.0.0/0 | grep -oP 'via \K\S+') | grep -oP 'src \K\S+') -define private-ips = { 10.0.0.0/8, 127.0.0.1/8, 172.16.0.0/12, 192.168.0.0/16 } -table ip aproxy -flush table ip aproxy -table ip aproxy { - chain prerouting { - type nat hook prerouting priority dstnat; policy accept; - ip daddr != \$private-ips tcp dport { 80, 443 } counter dnat to \$default-ip:54969 - } - - chain output { - type nat hook output priority -100; policy accept; - ip daddr != \$private-ips tcp dport { 80, 443 } counter dnat to \$default-ip:54969 - } -} -EOF -systemctl enable nftables.service -nft -f /etc/nftables.conf -{% endif %} - -adduser ubuntu lxd -adduser ubuntu adm - -{% if dockerhub_mirror %} -echo "{\"registry-mirrors\": [\"{{ dockerhub_mirror }}\"]}" > /etc/docker/daemon.json -sudo systemctl daemon-reload -sudo systemctl restart docker -{% endif %} - -# Prepare metrics -su - ubuntu -c 'mkdir "{{ metrics_exchange_path }}"' - -# Insert pre-job script -cat << 'EOF' | su - ubuntu -c 
'tee /home/ubuntu/actions-runner/pre-job.sh' -{{ pre_job_contents | safe }} -EOF - -# Create the runner and start the configuration experience -{% if runner_group %} -su - ubuntu -c "cd ~/actions-runner && ./config.sh \ - --url {{ github_url }} \ - --runnergroup '{{ runner_group }}' \ - --token {{ token }} --ephemeral --unattended \ - --labels {{ instance_labels }} --name {{ instance_name }}" -{% else %} -su - ubuntu -c "cd ~/actions-runner && ./config.sh \ - --url {{ github_url }} \ - --token {{ token }} --ephemeral --unattended \ - --labels {{ instance_labels }} --name {{ instance_name }}" -{% endif %} - - -write_post_metrics(){ - # Expects the exit code of the run.sh script as the first argument. - - # Only write the post-job metrics if the file does not already exist - which may indicate - # that the job has failed inside pre-job. - - if [ -f {{ metrics_exchange_path}}/post-job-metrics.json ]; then - return - fi - - timestamp=$(date +%s) - - # Write the post-job metrics using status abnormal and exit code if exit code is non-zero - if [ "$1" != "0" ]; then - sudo -g ubuntu -u ubuntu jq -n \ - --argjson timestamp "$timestamp" \ - --arg status "abnormal" \ - --argjson exit_code "$1" \ - '{ - "timestamp": $timestamp, - "status": $status, - "status_info": {code: $exit_code} - }' > "{{ metrics_exchange_path}}/post-job-metrics.json" - return - else - # If exit code is zero, write the post-job metrics using status normal - sudo -g ubuntu -u ubuntu jq -n \ - --argjson timestamp "$timestamp" \ - '{ - "timestamp": $timestamp, - "status": "normal" - }' > "{{ metrics_exchange_path }}/post-job-metrics.json" - fi -} - -# Run runner -# We want to capture the exit code of the run.sh script and write the post-job metrics. -(set +e; su - ubuntu -c "cd ~/actions-runner && /home/ubuntu/actions-runner/run.sh"; write_post_metrics $?) 
- -su - ubuntu -c "touch /home/ubuntu/run-completed" diff --git a/tests/integration/helpers/charm_metrics.py b/tests/integration/helpers/charm_metrics.py index b6c2f05bc..15cd7e3db 100644 --- a/tests/integration/helpers/charm_metrics.py +++ b/tests/integration/helpers/charm_metrics.py @@ -14,12 +14,12 @@ from github.Repository import Repository from github.Workflow import Workflow from github.WorkflowJob import WorkflowJob +from github_runner_manager.metrics.events import METRICS_LOG_PATH +from github_runner_manager.metrics.runner import PostJobStatus +from github_runner_manager.types_.github import JobConclusion from juju.application import Application from juju.unit import Unit -from github_type import JobConclusion -from metrics.events import METRICS_LOG_PATH -from metrics.runner import PostJobStatus from tests.integration.helpers.common import ( InstanceHelper, get_file_content, diff --git a/tests/integration/helpers/common.py b/tests/integration/helpers/common.py index 16622c038..495c952b3 100644 --- a/tests/integration/helpers/common.py +++ b/tests/integration/helpers/common.py @@ -36,7 +36,7 @@ TOKEN_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME, ) -from runner_manager import RunnerManager +from runner_manager import LXDRunnerManager from tests.status_name import ACTIVE DISPATCH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_test.yaml" @@ -93,7 +93,7 @@ async def check_runner_binary_exists(unit: Unit) -> bool: Returns: Whether the runner binary file exists in the charm. """ - return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {LXDRunnerManager.runner_bin_path}") return return_code == 0 @@ -141,10 +141,10 @@ async def remove_runner_bin(unit: Unit) -> None: Args: unit: Unit instance to check for the LXD profile. 
""" - await run_in_unit(unit, f"rm {RunnerManager.runner_bin_path}") + await run_in_unit(unit, f"rm {LXDRunnerManager.runner_bin_path}") # No file should exists under with the filename. - return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {LXDRunnerManager.runner_bin_path}") assert return_code != 0 diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index b2d7624a6..a77c14604 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -2,9 +2,11 @@ # See LICENSE file for licensing details. import logging import secrets +from asyncio import sleep from typing import Optional, TypedDict, cast import openstack.connection +from github_runner_manager.openstack_cloud.openstack_cloud import OpenstackCloud from juju.application import Application from juju.unit import Unit from openstack.compute.v2.server import Server @@ -40,8 +42,9 @@ async def expose_to_instance( unit: The juju unit of the github-runner charm. port: The port on the juju machine to expose to the runner. 
""" - runner = self._get_runner(unit=unit) + runner = self._get_single_runner(unit=unit) assert runner, f"Runner not found for unit {unit.name}" + logger.info("[TEST SETUP] Exposing port %s on %s", port, runner.name) network_address_list = runner.addresses.values() logger.warning(network_address_list) assert ( @@ -55,9 +58,24 @@ async def expose_to_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" - ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' + key_path = OpenstackCloud._get_key_path(runner.name) + exit_code, _, _ = await run_in_unit(unit, f"ls {key_path}") + assert exit_code == 0, f"Unable to find key file {key_path}" + ssh_cmd = f'ssh -fNT -R {port}:localhost:{port} -i {key_path} -o "StrictHostKeyChecking no" -o "ControlPersist yes" ubuntu@{ip} &' exit_code, _, stderr = await run_in_unit(unit, ssh_cmd) - assert exit_code == 0, f"Error in SSH remote forwarding of port {port}: {stderr}" + assert ( + exit_code == 0 + ), f"Error in starting background process of SSH remote forwarding of port {port}: {stderr}" + + await sleep(1) + for _ in range(6): + exit_code, _, _ = await self.run_in_instance( + unit=unit, command=f"nc -z localhost {port}" + ) + if exit_code == 0: + return + await sleep(10) + assert False, f"Exposing the port {port} failed" async def run_in_instance( self, @@ -79,8 +97,9 @@ async def run_in_instance( Returns: Tuple of return code, stdout and stderr. 
""" - runner = self._get_runner(unit=unit) + runner = self._get_single_runner(unit=unit) assert runner, f"Runner not found for unit {unit.name}" + logger.info("[TEST SETUP] Run command %s on %s", command, runner.name) network_address_list = runner.addresses.values() logger.warning(network_address_list) assert ( @@ -94,7 +113,10 @@ async def run_in_instance( break assert ip, f"Failed to get IP address for OpenStack server {runner.name}" - ssh_cmd = f'ssh -i /home/ubuntu/.ssh/runner-{runner.name}.key -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' + key_path = OpenstackCloud._get_key_path(runner.name) + exit_code, _, _ = await run_in_unit(unit, f"ls {key_path}") + assert exit_code == 0, f"Unable to find key file {key_path}" + ssh_cmd = f'ssh -i {key_path} -o "StrictHostKeyChecking no" ubuntu@{ip} {command}' ssh_cmd_as_ubuntu_user = f"su - ubuntu -c '{ssh_cmd}'" logging.warning("ssh_cmd: %s", ssh_cmd_as_ubuntu_user) exit_code, stdout, stderr = await run_in_unit(unit, ssh_cmd, timeout) @@ -152,12 +174,14 @@ async def _get_runner_names(self, unit: Unit) -> tuple[str, ...]: Returns: Tuple of runner names. """ - runner = self._get_runner(unit) + runner = self._get_single_runner(unit) assert runner, "Failed to find runner server" return (cast(str, runner.name),) - def _get_runner(self, unit: Unit) -> Server | None: - """Get the runner server. + def _get_single_runner(self, unit: Unit) -> Server | None: + """Get the only runner for the unit. + + This method asserts for exactly one runner for the unit. Args: unit: The unit to get the runner for. @@ -166,14 +190,12 @@ def _get_runner(self, unit: Unit) -> Server | None: The runner server. 
""" servers: list[Server] = self.openstack_connection.list_servers() - runner = None unit_name_without_slash = unit.name.replace("/", "-") - for server in servers: - if server.name.startswith(unit_name_without_slash): - runner = server - break - - return runner + runners = [server for server in servers if server.name.startswith(unit_name_without_slash)] + assert ( + len(runners) == 1 + ), f"In {unit.name} found more than one runners or no runners: {runners}" + return runners[0] async def setup_repo_policy( @@ -214,6 +236,13 @@ async def setup_repo_policy( await instance_helper.ensure_charm_has_runner(app=app) await instance_helper.expose_to_instance(unit, 8080) + # This tests the connection to the repo policy compliance, not a health check of service. + await instance_helper.run_in_instance( + unit=unit, + command="curl http://localhost:8080", + assert_on_failure=True, + assert_msg="Unable to reach the repo policy compliance server setup", + ) async def _install_repo_policy( @@ -247,7 +276,7 @@ async def _install_repo_policy( ) await run_in_unit( unit, - f'sudo -u ubuntu HTTPS_PROXY={https_proxy if https_proxy else ""} pip install --proxy http://squid.internal:3128 -r /home/ubuntu/repo_policy_compliance/requirements.txt', + f'sudo -u ubuntu HTTPS_PROXY={https_proxy if https_proxy else ""} pip install {f"--proxy {https_proxy}" if https_proxy else ""} -r /home/ubuntu/repo_policy_compliance/requirements.txt', assert_on_failure=True, assert_msg="Failed to install repo-policy-compliance requirements", ) diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index e3de1600d..6ce23fa0d 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -10,12 +10,12 @@ import pytest_asyncio from github.Branch import Branch from github.Repository import Repository +from github_runner_manager.metrics import runner_logs +from github_runner_manager.metrics.runner import 
PostJobStatus from juju.application import Application from juju.model import Model from charm_state import PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME -from metrics import runner_logs -from metrics.runner import PostJobStatus from tests.integration.helpers.charm_metrics import ( assert_events_after_reconciliation, cancel_workflow_run, diff --git a/tests/integration/test_charm_metrics_success.py b/tests/integration/test_charm_metrics_success.py index c9b7a8dc0..5e8254e5d 100644 --- a/tests/integration/test_charm_metrics_success.py +++ b/tests/integration/test_charm_metrics_success.py @@ -10,11 +10,11 @@ import pytest_asyncio from github.Branch import Branch from github.Repository import Repository +from github_runner_manager.metrics.runner import PostJobStatus from juju.application import Application from juju.model import Model from charm_state import PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME -from metrics.runner import PostJobStatus from tests.integration.helpers.charm_metrics import ( assert_events_after_reconciliation, clear_metrics_log, diff --git a/tests/integration/test_charm_scheduled_events.py b/tests/integration/test_charm_scheduled_events.py index aa4a9f1b3..5e9819f23 100644 --- a/tests/integration/test_charm_scheduled_events.py +++ b/tests/integration/test_charm_scheduled_events.py @@ -13,7 +13,7 @@ from juju.application import Application from juju.model import Model -from runner_manager import RunnerManager +from runner_manager import LXDRunnerManager from tests.integration.helpers.common import check_runner_binary_exists from tests.integration.helpers.lxd import get_runner_names, run_in_unit, wait_till_num_of_runners from tests.status_name import ACTIVE @@ -40,7 +40,7 @@ async def test_update_interval(model: Model, app_scheduled_events: Application) unit = app_scheduled_events.units[0] assert await check_runner_binary_exists(unit) - ret_code, stdout, stderr = await run_in_unit(unit, f"rm -f {RunnerManager.runner_bin_path}") + ret_code, 
stdout, stderr = await run_in_unit(unit, f"rm -f {LXDRunnerManager.runner_bin_path}") assert ret_code == 0, f"Failed to remove runner binary {stdout} {stderr}" assert not await check_runner_binary_exists(unit) diff --git a/tests/integration/test_reactive.py b/tests/integration/test_reactive.py index b7445be1f..06dc6e48c 100644 --- a/tests/integration/test_reactive.py +++ b/tests/integration/test_reactive.py @@ -6,14 +6,14 @@ import secrets import pytest +from github_runner_manager.reactive.consumer import JobDetails +from github_runner_manager.reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from juju.application import Application from juju.model import Model from juju.unit import Unit from kombu import Connection from pytest_operator.plugin import OpsTest -from reactive.consumer import JobDetails -from reactive.runner_manager import REACTIVE_RUNNER_LOG_DIR from tests.integration.helpers.common import get_file_content, reconcile, run_in_unit FAKE_URL = "http://example.com" diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index b20426ca0..cb88d84ba 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -1,5 +1,5 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
"""Testing the RunnerManager class with OpenStackRunnerManager as CloudManager.""" @@ -15,19 +15,28 @@ from github.Branch import Branch from github.Repository import Repository from github.Workflow import Workflow -from openstack.connection import Connection as OpenstackConnection - -from charm_state import GithubPath, ProxyConfig, parse_github_path -from manager.cloud_runner_manager import CloudRunnerState, GitHubRunnerConfig, SupportServiceConfig -from manager.github_runner_manager import GitHubRunnerState -from manager.runner_manager import FlushMode, RunnerManager, RunnerManagerConfig -from metrics import events, storage -from openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH -from openstack_cloud.openstack_runner_manager import ( +from github_runner_manager.manager.cloud_runner_manager import ( + CloudRunnerState, + GitHubRunnerConfig, + SupportServiceConfig, +) +from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.manager.runner_manager import ( + FlushMode, + RunnerManager, + RunnerManagerConfig, +) +from github_runner_manager.metrics import events, storage +from github_runner_manager.openstack_cloud.openstack_cloud import _CLOUDS_YAML_PATH +from github_runner_manager.openstack_cloud.openstack_runner_manager import ( OpenStackCloudConfig, - OpenstackRunnerManager, + OpenStackRunnerManager, OpenStackServerConfig, ) +from github_runner_manager.types_.github import GitHubPath, parse_github_path +from openstack.connection import Connection as OpenstackConnection + +from charm_state import ProxyConfig from tests.integration.helpers.common import ( DISPATCH_WAIT_TEST_WORKFLOW_FILENAME, dispatch_workflow, @@ -64,7 +73,7 @@ def log_dir_base_path_fixture( @pytest.fixture(scope="module", name="github_path") -def github_path_fixture(path: str) -> GithubPath: +def github_path_fixture(path: str) -> GitHubPath: return parse_github_path(path, "Default") @@ -92,11 +101,11 @@ async def 
openstack_runner_manager_fixture( openstack_test_image: str, flavor_name: str, network_name: str, - github_path: GithubPath, + github_path: GitHubPath, proxy_config: ProxyConfig, runner_label: str, openstack_connection: OpenstackConnection, -) -> OpenstackRunnerManager: +) -> OpenStackRunnerManager: """Create OpenstackRunnerManager instance. The prefix args of OpenstackRunnerManager set to app_name to let openstack_connection_fixture @@ -122,19 +131,18 @@ async def openstack_runner_manager_fixture( proxy_config=proxy_config, dockerhub_mirror=None, ssh_debug_connections=None, - repo_policy_url=None, - repo_policy_token=None, + repo_policy_compliance=None, ) - return OpenstackRunnerManager( - app_name, cloud_config, server_config, runner_config, service_config + return OpenStackRunnerManager( + app_name, f"{app_name}-0", cloud_config, server_config, runner_config, service_config ) @pytest_asyncio.fixture(scope="module", name="runner_manager") async def runner_manager_fixture( - openstack_runner_manager: OpenstackRunnerManager, + openstack_runner_manager: OpenStackRunnerManager, token: str, - github_path: GithubPath, + github_path: GitHubPath, log_dir_base_path: dict[str, Path], ) -> RunnerManager: """Get RunnerManager instance. @@ -142,7 +150,7 @@ async def runner_manager_fixture( Import of log_dir_base_path to monkeypatch the runner logs path with tmp_path. 
""" config = RunnerManagerConfig(token, github_path) - return RunnerManager(openstack_runner_manager, config) + return RunnerManager("test_runner", openstack_runner_manager, config) @pytest_asyncio.fixture(scope="function", name="runner_manager_with_one_runner") @@ -219,7 +227,7 @@ async def test_get_no_runner(runner_manager: RunnerManager) -> None: @pytest.mark.asyncio @pytest.mark.abort_on_fail async def test_runner_normal_idle_lifecycle( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager + runner_manager: RunnerManager, openstack_runner_manager: OpenStackRunnerManager ) -> None: """ Arrange: RunnerManager instance with no runners. @@ -397,7 +405,7 @@ async def test_runner_normal_lifecycle( @pytest.mark.asyncio @pytest.mark.abort_on_fail async def test_runner_spawn_two( - runner_manager: RunnerManager, openstack_runner_manager: OpenstackRunnerManager + runner_manager: RunnerManager, openstack_runner_manager: OpenStackRunnerManager ) -> None: """ Arrange: RunnerManager instance with no runners. diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py index c91ac8e97..46c8280b1 100644 --- a/tests/integration/test_self_hosted_runner.py +++ b/tests/integration/test_self_hosted_runner.py @@ -9,6 +9,7 @@ import github import pytest from github.Repository import Repository +from github_runner_manager.types_.github import GitHubRepo from juju.application import Application from juju.model import Model @@ -16,7 +17,6 @@ DOCKERHUB_MIRROR_CONFIG_NAME, PATH_CONFIG_NAME, VIRTUAL_MACHINES_CONFIG_NAME, - GithubRepo, ) from github_client import GithubClient from tests.integration.helpers.common import ( @@ -150,7 +150,7 @@ async def test_flush_busy_runner( # Wait until runner online and then busy. 
for _ in range(30): all_runners = runner_manager_github_client.get_runner_github_info( - GithubRepo( + GitHubRepo( owner=forked_github_repository.owner.login, repo=forked_github_repository.name ) ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 3ee0259f7..c0b760144 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -8,9 +8,9 @@ from pathlib import Path import pytest +from github_runner_manager.manager.runner_scaler import RunnerScaler import utilities -from openstack_cloud import openstack_manager from tests.unit.mock import MockGhapiClient, MockLxdClient, MockRepoPolicyComplianceClient @@ -46,7 +46,7 @@ def disk_usage_mock(total_disk: int): @pytest.fixture(autouse=True) def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_path): - openstack_manager_mock = unittest.mock.MagicMock(spec=openstack_manager) + runner_scaler_mock = unittest.mock.MagicMock(spec=RunnerScaler) cron_path = tmp_path / "cron.d" cron_path.mkdir() @@ -61,7 +61,7 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr( "charm.GithubRunnerCharm.repo_check_systemd_service", tmp_path / "systemd_service" ) - monkeypatch.setattr("charm.OpenstackRunnerManager", openstack_manager_mock) + monkeypatch.setattr("charm.RunnerScaler", runner_scaler_mock) monkeypatch.setattr("charm.GithubRunnerCharm.kernel_module_path", tmp_path / "modules") monkeypatch.setattr("charm.GithubRunnerCharm._update_kernel", lambda self, now: None) monkeypatch.setattr("charm.execute_command", exec_command) @@ -76,9 +76,11 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr("firewall.Firewall.refresh_firewall", unittest.mock.MagicMock()) monkeypatch.setattr("runner.execute_command", lxd_exec_command) monkeypatch.setattr("runner.shared_fs", unittest.mock.MagicMock()) - monkeypatch.setattr("metrics.events.METRICS_LOG_PATH", Path(tmp_path / "metrics.log")) + 
monkeypatch.setattr( + "github_runner_manager.metrics.events.METRICS_LOG_PATH", Path(tmp_path / "metrics.log") + ) monkeypatch.setattr("runner.time", unittest.mock.MagicMock()) - monkeypatch.setattr("github_client.GhApi", MockGhapiClient) + monkeypatch.setattr("github_runner_manager.github_client.GhApi", MockGhapiClient) monkeypatch.setattr("runner_manager_type.jinja2", unittest.mock.MagicMock()) monkeypatch.setattr("runner_manager_type.LxdClient", MockLxdClient) monkeypatch.setattr("runner_manager.github_metrics", unittest.mock.MagicMock()) @@ -86,12 +88,12 @@ def mocks(monkeypatch, tmp_path, exec_command, lxd_exec_command, runner_binary_p monkeypatch.setattr("runner_manager.LxdClient", MockLxdClient) monkeypatch.setattr("runner_manager.shared_fs", unittest.mock.MagicMock()) monkeypatch.setattr("runner_manager.execute_command", exec_command) - monkeypatch.setattr("runner_manager.RunnerManager.runner_bin_path", runner_binary_path) - monkeypatch.setattr("runner_manager.RunnerManager.cron_path", cron_path) + monkeypatch.setattr("runner_manager.LXDRunnerManager.runner_bin_path", runner_binary_path) + monkeypatch.setattr("runner_manager.LXDRunnerManager.cron_path", cron_path) monkeypatch.setattr( "runner_manager.RepoPolicyComplianceClient", MockRepoPolicyComplianceClient ) - monkeypatch.setattr("utilities.time", unittest.mock.MagicMock()) + monkeypatch.setattr("github_runner_manager.utilities.time", unittest.mock.MagicMock()) @pytest.fixture(autouse=True, name="cloud_name") @@ -108,7 +110,7 @@ def clouds_yaml_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Path: Path: Mocked clouds.yaml path. 
""" clouds_yaml_path = tmp_path / "clouds.yaml" - monkeypatch.setattr("openstack_cloud.CLOUDS_YAML_PATH", clouds_yaml_path) + monkeypatch.setattr("github_runner_manager.openstack_cloud.CLOUDS_YAML_PATH", clouds_yaml_path) return clouds_yaml_path diff --git a/tests/unit/metrics/__init__.py b/tests/unit/metrics/__init__.py deleted file mode 100644 index 188515554..000000000 --- a/tests/unit/metrics/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. diff --git a/tests/unit/metrics/test_events.py b/tests/unit/metrics/test_events.py deleted file mode 100644 index 195768291..000000000 --- a/tests/unit/metrics/test_events.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import json -from pathlib import Path - -from metrics import events - -TEST_LOKI_PUSH_API_URL = "http://loki:3100/api/prom/push" - - -def test_issue_events_logs_events(tmp_path: Path): - """ - arrange: Change path of the events log. - act: Issue a metric event. - assert: The expected metric log is created. - """ - event = events.RunnerInstalled(timestamp=123, flavor="small", duration=456) - - events.issue_event(event) - - assert json.loads(events.METRICS_LOG_PATH.read_text()) == { - "event": "runner_installed", - "timestamp": 123, - "flavor": "small", - "duration": 456, - } - - -def test_issue_events_exclude_none_values(tmp_path: Path): - """ - arrange: Change path of the events log. - act: Issue a metric event with a None value. - assert: The expected metric log without the None value is created. 
- """ - event = events.RunnerStop( - timestamp=123, - flavor="small", - workflow="workflow", - repo="repo", - github_event="github_event", - status="status", - status_info=None, - job_duration=456, - ) - - events.issue_event(event) - - assert json.loads(events.METRICS_LOG_PATH.read_text()) == { - "event": "runner_stop", - "timestamp": 123, - "flavor": "small", - "workflow": "workflow", - "repo": "repo", - "github_event": "github_event", - "status": "status", - "job_duration": 456, - } diff --git a/tests/unit/metrics/test_github.py b/tests/unit/metrics/test_github.py deleted file mode 100644 index 78a21e4e1..000000000 --- a/tests/unit/metrics/test_github.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import secrets -from datetime import datetime, timedelta, timezone -from random import randint -from unittest.mock import MagicMock - -import pytest - -from errors import GithubMetricsError, JobNotFoundError -from github_client import GithubClient -from github_type import JobConclusion, JobStats -from metrics import github as github_metrics -from metrics.runner import PreJobMetrics - - -@pytest.fixture(name="pre_job_metrics") -def pre_job_metrics_fixture() -> PreJobMetrics: - """Create a PreJobMetrics object.""" - return PreJobMetrics( - repository="owner/repo", - workflow_run_id=1, - workflow="workflow", - job_name="job", - job_started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - timestamp=1234567890, - event="push", - ) - - -def test_job(pre_job_metrics: PreJobMetrics): - """ - arrange: create a GithubClient mock which returns a GithubJobStats object. - act: Call job. - assert: the job metrics are returned. 
- """ - github_client = MagicMock(spec=GithubClient) - runner_name = secrets.token_hex(16) - created_at = datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc) - started_at = created_at + timedelta(seconds=3600) - github_client.get_job_info.return_value = JobStats( - created_at=created_at, - started_at=started_at, - runner_name=runner_name, - conclusion=JobConclusion.SUCCESS, - job_id=randint(1, 1000), - ) - - job_metrics = github_metrics.job( - github_client=github_client, pre_job_metrics=pre_job_metrics, runner_name=runner_name - ) - - assert job_metrics.queue_duration == 3600 - assert job_metrics.conclusion == JobConclusion.SUCCESS - - -def test_job_job_not_found(pre_job_metrics: PreJobMetrics): - """ - arrange: create a GithubClient mock which raises a JobNotFound exception. - act: Call job. - assert: a GithubMetricsError is raised. - """ - github_client = MagicMock(spec=GithubClient) - runner_name = secrets.token_hex(16) - github_client.get_job_info.side_effect = JobNotFoundError("Job not found") - - with pytest.raises(GithubMetricsError): - github_metrics.job( - github_client=github_client, pre_job_metrics=pre_job_metrics, runner_name=runner_name - ) diff --git a/tests/unit/metrics/test_runner.py b/tests/unit/metrics/test_runner.py deleted file mode 100644 index bf0a14251..000000000 --- a/tests/unit/metrics/test_runner.py +++ /dev/null @@ -1,649 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
-import json -import secrets -from pathlib import Path -from unittest.mock import MagicMock, call - -import pytest - -from errors import DeleteMetricsStorageError, IssueMetricEventError -from github_type import JobConclusion -from metrics import events as metric_events -from metrics import runner as runner_metrics -from metrics import type as metrics_type -from metrics.events import RunnerStart, RunnerStop -from metrics.runner import ( - RUNNER_INSTALLED_TS_FILE_NAME, - PostJobMetrics, - PreJobMetrics, - RunnerMetrics, -) -from metrics.storage import MetricsStorage - - -@pytest.fixture(autouse=True, name="issue_event_mock") -def issue_event_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock the issue_event function.""" - issue_event_mock = MagicMock() - monkeypatch.setattr("metrics.events.issue_event", issue_event_mock) - return issue_event_mock - - -@pytest.fixture(name="runner_fs_base") -def runner_fs_base_fixture(tmp_path: Path) -> Path: - """Create a runner filesystem base.""" - runner_fs_base = tmp_path / "runner-fs" - runner_fs_base.mkdir(exist_ok=True) - return runner_fs_base - - -def _create_metrics_data(runner_name: str) -> RunnerMetrics: - """Create a RunnerMetrics object that is suitable for most tests. - - Args: - runner_name: The test runner name. - - Returns: - Test metrics data. - """ - return RunnerMetrics( - installed_timestamp=1, - pre_job=PreJobMetrics( - timestamp=1, - workflow="workflow1", - workflow_run_id="workflow_run_id1", - repository="org1/repository1", - event="push", - ), - post_job=PostJobMetrics(timestamp=3, status=runner_metrics.PostJobStatus.NORMAL), - runner_name=runner_name, - ) - - -def _create_runner_fs_base(tmp_path: Path): - """Create a runner filesystem base. - - Args: - tmp_path: The temporary path to create test runner filesystem under. - - Returns: - The runner filesystem temporary path. 
- """ - runner_fs_base = tmp_path / "runner-fs" - runner_fs_base.mkdir(exist_ok=True) - return runner_fs_base - - -def _create_runner_files( - runner_fs_base: Path, - runner_name: str, - pre_job_data: str | bytes | None, - post_job_data: str | bytes | None, - installed_timestamp: str | bytes | None, -) -> MetricsStorage: - """Create runner files inside shared fs. - - If the data is bytes, the file is written as binary, otherwise as text. - If data is None, it is not written. - - Args: - runner_fs_base: The base path of the shared fs. - runner_name: The runner name. - pre_job_data: The pre-job metrics data. - post_job_data: The post-job metrics data. - installed_timestamp: The installed timestamp. - - Returns: - A SharedFilesystem instance. - """ - runner_fs = runner_fs_base / runner_name - runner_fs.mkdir() - if pre_job_data: - if isinstance(pre_job_data, bytes): - runner_fs.joinpath(runner_metrics.PRE_JOB_METRICS_FILE_NAME).write_bytes(pre_job_data) - else: - runner_fs.joinpath(runner_metrics.PRE_JOB_METRICS_FILE_NAME).write_text( - pre_job_data, encoding="utf-8" - ) - - if post_job_data: - if isinstance(post_job_data, bytes): - runner_fs.joinpath(runner_metrics.POST_JOB_METRICS_FILE_NAME).write_bytes( - post_job_data - ) - else: - runner_fs.joinpath(runner_metrics.POST_JOB_METRICS_FILE_NAME).write_text( - post_job_data, encoding="utf-8" - ) - - if installed_timestamp: - if isinstance(installed_timestamp, bytes): - runner_fs.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).write_bytes(installed_timestamp) - else: - runner_fs.joinpath(RUNNER_INSTALLED_TS_FILE_NAME).write_text( - installed_timestamp, encoding="utf-8" - ) - return MetricsStorage(path=runner_fs, runner_name=runner_name) - - -def test_extract(runner_fs_base: Path): - """ - arrange: \ - 1. A runner with all metrics inside shared fs. \ - 2. A runner with only pre-job metrics inside shared fs. \ - 3. A runner with no metrics except installed_timestamp inside shared fs. 
- act: Call extract - assert: All shared filesystems are removed and for runners - 1. + 2. metrics are extracted - 3. no metrics are extracted - """ - runner_all_metrics_name = secrets.token_hex(16) - runner_all_metrics = _create_metrics_data(runner_all_metrics_name) - runner_wihout_post_job_name = secrets.token_hex(16) - runner_without_post_job_metrics = runner_all_metrics.copy() - runner_without_post_job_metrics.post_job = None - runner_without_post_job_metrics.runner_name = runner_wihout_post_job_name - - # 1. Runner has all metrics inside shared fs - runner1_fs = _create_runner_files( - runner_fs_base, - runner_all_metrics_name, - runner_all_metrics.pre_job.json(), - runner_all_metrics.post_job.json(), - str(runner_all_metrics.installed_timestamp), - ) - - # 2. Runner has only pre-job metrics inside shared fs - runner2_fs = _create_runner_files( - runner_fs_base, - runner_wihout_post_job_name, - runner_without_post_job_metrics.pre_job.json(), - None, - str(runner_without_post_job_metrics.installed_timestamp), - ) - - # 3. Runner has no metrics except installed_timestamp inside shared fs - runner3_fs = _create_runner_files(runner_fs_base, secrets.token_hex(16), None, None, "5") - - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner1_fs, runner2_fs, runner3_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert extracted_metrics == [ - runner_all_metrics, - runner_without_post_job_metrics, - ] - metrics_storage_manager.delete.assert_has_calls( - [ - ((runner1_fs.runner_name,),), - ((runner2_fs.runner_name,),), - ((runner3_fs.runner_name,),), - ] - ) - - -def test_extract_ignores_runners(runner_fs_base: Path): - """ - arrange: Runners with metrics. - act: Call extract with some runners on ignore list. - expect: The ignored runners are not processed. 
- """ - runner_metrics_data = [] - - runner_filesystems = [] - for i in range(5): - runner_name = secrets.token_hex(16) - data = _create_metrics_data(runner_name) - data.pre_job.workflow = f"workflow{i}" - runner_metrics_data.append(data) - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - data.pre_job.json(), - data.post_job.json(), - str(data.installed_timestamp), - ) - runner_filesystems.append(runner_fs) - - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = runner_filesystems - - ignore_runners = {runner_filesystems[0].runner_name, runner_filesystems[2].runner_name} - - extracted_metrics = list( - runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, runners=ignore_runners - ) - ) - - assert extracted_metrics == runner_metrics_data[1:2] + runner_metrics_data[3:] - - -def test_extract_corrupt_data(runner_fs_base: Path, monkeypatch: pytest.MonkeyPatch): - """ - arrange: \ - 1. A runner with non-compliant pre-job metrics inside shared fs. \ - 2. A runner with non-json post-job metrics inside shared fs. \ - 3. A runner with json array post-job metrics inside shared fs. \ - 4. A runner with no real timestamp in installed_timestamp file inside shared fs. - act: Call extract. - assert: No metrics are extracted is issued and shared filesystems are quarantined in all cases. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - # 1. 
Runner has noncompliant pre-job metrics inside shared fs - invalid_pre_job_data = runner_metrics_data.pre_job.copy(update={"timestamp": -1}) - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - invalid_pre_job_data.json(), - runner_metrics_data.post_job.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner_fs] - move_to_quarantine_mock = MagicMock() - monkeypatch.setattr(runner_metrics, "move_to_quarantine", move_to_quarantine_mock) - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 2. Runner has non-json post-job metrics inside shared fs - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - b"\x00", - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 3. Runner has json post-job metrics but a json array (not object) inside shared fs. 
- runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - json.dumps([runner_metrics_data.post_job.dict()]), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 4. Runner has not a timestamp in installed_timestamp file inside shared fs - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name=runner_name) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - b"\x00", - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - -def test_extract_raises_error_for_too_large_files( - runner_fs_base: Path, issue_event_mock: MagicMock, monkeypatch: pytest.MonkeyPatch -): - """ - arrange: Runners with too large metric and timestamp files. - act: Call extract. - assert: No metrics are extracted and shared filesystems is quarantined. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - # 1. 
Runner has a pre-job metrics file that is too large - invalid_pre_job_data = runner_metrics_data.pre_job.copy( - update={"workflow": "a" * runner_metrics.FILE_SIZE_BYTES_LIMIT + "b"} - ) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - invalid_pre_job_data.json(), - runner_metrics_data.post_job.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager = MagicMock() - - metrics_storage_manager.list_all.return_value = [runner_fs] - - move_to_quarantine_mock = MagicMock() - monkeypatch.setattr(runner_metrics, "move_to_quarantine", move_to_quarantine_mock) - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 2. Runner has a post-job metrics file that is too large - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - invalid_post_job_data = runner_metrics_data.post_job.copy( - update={"status": "a" * runner_metrics.FILE_SIZE_BYTES_LIMIT + "b"} - ) - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - invalid_post_job_data.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert not extracted_metrics - - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - # 3. 
Runner has an installed_timestamp file that is too large - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - invalid_ts = "1" * (runner_metrics.FILE_SIZE_BYTES_LIMIT + 1) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - invalid_ts, - ) - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - - assert not extracted_metrics - move_to_quarantine_mock.assert_any_call(metrics_storage_manager, runner_fs.runner_name) - - -def test_extract_ignores_filesystems_without_ts(runner_fs_base: Path): - """ - arrange: A runner without installed_timestamp file inside shared fs. - act: Call extract. - assert: No metrics are extracted and shared filesystem is removed. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = RunnerMetrics.construct( - installed_timestamp=1, - pre_job=PreJobMetrics( - timestamp=1, - workflow="workflow1", - workflow_run_id="workflow_run_id1", - repository="org1/repository1", - event="push", - ), - post_job=PostJobMetrics(timestamp=3, status=runner_metrics.PostJobStatus.NORMAL), - runner_name=runner_name, - ) - - runner_fs = _create_runner_files( - runner_fs_base, - runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - None, - ) - metrics_storage_manager = MagicMock() - metrics_storage_manager.list_all.return_value = [runner_fs] - - extracted_metrics = list( - runner_metrics.extract(metrics_storage_manager=metrics_storage_manager, runners=set()) - ) - assert not extracted_metrics - metrics_storage_manager.delete.assert_called_once_with(runner_fs.runner_name) - - -def test_extract_ignores_failure_on_shared_fs_cleanup( - runner_fs_base: Path, - caplog: pytest.LogCaptureFixture, -): - """ - arrange: Mock the shared_fs.delete to raise an 
exception. - act: Call extract. - assert: The metric is extracted and the exception is caught and logged. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_fs = _create_runner_files( - runner_fs_base, - runner_metrics_data.runner_name, - runner_metrics_data.pre_job.json(), - runner_metrics_data.post_job.json(), - str(runner_metrics_data.installed_timestamp), - ) - metrics_storage_manager = MagicMock() - - metrics_storage_manager.list_all.return_value = [runner_fs] - - metrics_storage_manager.delete.side_effect = DeleteMetricsStorageError( - "Failed to delete shared filesystem" - ) - - extracted_metrics = runner_metrics.extract( - metrics_storage_manager=metrics_storage_manager, runners=set() - ) - assert list(extracted_metrics) == [runner_metrics_data] - - assert "Failed to delete shared filesystem" in caplog.text - - -def test_issue_events(issue_event_mock: MagicMock): - """ - arrange: A runner with all metrics. - act: Call issue_events. - assert: RunnerStart and RunnerStop metrics are issued. - """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert issued_metrics == {metric_events.RunnerStart, metric_events.RunnerStop} - issue_event_mock.assert_has_calls( - [ - # 1. 
Runner - call( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=runner_metrics_data.pre_job.timestamp - - runner_metrics_data.installed_timestamp, - queue_duration=job_metrics.queue_duration, - ) - ), - call( - RunnerStop( - timestamp=runner_metrics_data.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - status=runner_metrics_data.post_job.status, - job_duration=runner_metrics_data.post_job.timestamp - - runner_metrics_data.pre_job.timestamp, - job_conclusion=job_metrics.conclusion, - ) - ), - ] - ) - - -def test_issue_events_pre_job_before_runner_installed(issue_event_mock: MagicMock): - """ - arrange: A runner with pre-job timestamp smaller than installed timestamp. - act: Call issue_events. - assert: RunnerStart metric is issued with idle set to 0. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.pre_job.timestamp = 0 - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert metric_events.RunnerStart in issued_metrics - issue_event_mock.assert_has_calls( - [ - call( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=0, - queue_duration=job_metrics.queue_duration, - ) - ) - ] - ) - - -def test_issue_events_post_job_before_pre_job(issue_event_mock: MagicMock): - """ - arrange: A runner with post-job timestamp smaller than pre-job timestamps. - act: Call issue_events. - assert: job_duration is set to zero. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.post_job = PostJobMetrics( - timestamp=0, status=runner_metrics.PostJobStatus.NORMAL - ) - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - - assert metric_events.RunnerStop in issued_metrics - issue_event_mock.assert_has_calls( - [ - call( - RunnerStop( - timestamp=runner_metrics_data.post_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - status=runner_metrics_data.post_job.status, - job_duration=0, - job_conclusion=job_metrics.conclusion, - ) - ), - ] - ) - - -def test_issue_events_no_post_job_metrics(issue_event_mock: MagicMock): - """ - arrange: A runner without post-job metrics. - act: Call issue_events. - assert: Only RunnerStart metric is issued. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - runner_metrics_data.post_job = None - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert issued_metrics == {metric_events.RunnerStart} - - issue_event_mock.assert_called_once_with( - RunnerStart( - timestamp=runner_metrics_data.pre_job.timestamp, - flavor=flavor, - workflow=runner_metrics_data.pre_job.workflow, - repo=runner_metrics_data.pre_job.repository, - github_event=runner_metrics_data.pre_job.event, - idle=runner_metrics_data.pre_job.timestamp - runner_metrics_data.installed_timestamp, - queue_duration=job_metrics.queue_duration, - ) - ) - - -def test_issue_events_returns_empty_set_on_issue_event_failure( - issue_event_mock: MagicMock, - caplog: pytest.LogCaptureFixture, -): - """ - arrange: Mock the issue_event_mock to raise an exception on the first call. - act: Call issue_events. - assert: No metrics at all are issued. The exception is caught and logged. 
- """ - runner_name = secrets.token_hex(16) - runner_metrics_data = _create_metrics_data(runner_name) - - issue_event_mock.side_effect = [IssueMetricEventError("Failed to issue metric"), None] - - flavor = secrets.token_hex(16) - job_metrics = metrics_type.GithubJobMetrics( - queue_duration=3600, conclusion=JobConclusion.SUCCESS - ) - - issued_metrics = runner_metrics.issue_events( - runner_metrics=runner_metrics_data, flavor=flavor, job_metrics=job_metrics - ) - assert not issued_metrics - assert "Failed to issue metric" in caplog.text diff --git a/tests/unit/metrics/test_runner_logs.py b/tests/unit/metrics/test_runner_logs.py deleted file mode 100644 index d53dc17cf..000000000 --- a/tests/unit/metrics/test_runner_logs.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -from pathlib import Path - -import pytest - -from metrics import runner_logs - - -@pytest.fixture(name="log_dir_base_path") -def log_dir_base_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: - """Mock the log directory path and return it.""" - log_dir_base_path = tmp_path / "log_dir" - monkeypatch.setattr(runner_logs, "RUNNER_LOGS_DIR_PATH", log_dir_base_path) - return log_dir_base_path - - -def test_remove_outdated_crashed(log_dir_base_path: Path, monkeypatch: pytest.MonkeyPatch): - """ - arrange: Mock the base log directory path. - act: Remove the logs of the runner. - assert: The expected logs are removed. - """ - monkeypatch.setattr(runner_logs, "OUTDATED_LOGS_IN_SECONDS", 0) - - log_dir_path = log_dir_base_path / "test-runner" - log_dir_path.mkdir(parents=True) - - runner_logs.remove_outdated() - - assert not log_dir_path.exists() diff --git a/tests/unit/metrics/test_storage.py b/tests/unit/metrics/test_storage.py deleted file mode 100644 index bc8d0e94c..000000000 --- a/tests/unit/metrics/test_storage.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. 
-import secrets -import tarfile -from pathlib import Path - -import pytest - -from errors import ( - CreateMetricsStorageError, - DeleteMetricsStorageError, - GetMetricsStorageError, - QuarantineMetricsStorageError, -) -from metrics import storage -from metrics.storage import MetricsStorage - - -@pytest.fixture(autouse=True, name="filesystem_paths") -def filesystem_paths_fixture(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> dict[str, Path]: - """Mock the hardcoded filesystem paths.""" - ms_path = tmp_path / "runner-fs" - ms_quarantine_path = tmp_path / "quarantine" - monkeypatch.setattr(storage, "FILESYSTEM_BASE_PATH", ms_path) - monkeypatch.setattr(storage, "FILESYSTEM_QUARANTINE_PATH", ms_quarantine_path) - return {"base": ms_path, "quarantine": ms_quarantine_path} - - -def test_create_creates_directory(): - """ - arrange: Given a runner name and a path for the storage. - act: Call create. - assert: The directory is created. - """ - runner_name = secrets.token_hex(16) - - fs = storage.create(runner_name) - - assert fs.path.exists() - assert fs.path.is_dir() - - -def test_create_raises_exception_if_already_exists(): - """ - arrange: Given a runner name and an already existing shared filesystem. - act: Call create. - assert: The expected exception is raised. - """ - runner_name = secrets.token_hex(16) - storage.create(runner_name) - - with pytest.raises(CreateMetricsStorageError): - storage.create(runner_name) - - -def test_list_all(): - """ - arrange: Create metric storages for multiple runners. - act: Call list_all. - assert: A generator listing all the shared filesystems is returned. - """ - runner_names = [secrets.token_hex(16) for _ in range(3)] - for runner_name in runner_names: - storage.create(runner_name) - - fs_list = list(storage.list_all()) - - assert len(fs_list) == 3 - for fs in fs_list: - assert isinstance(fs, storage.MetricsStorage) - assert fs.runner_name in runner_names - - -def test_list_all_empty(): - """ - arrange: Nothing. 
- act: Call list_all. - assert: An empty iterator is returned. - """ - fs_list = list(storage.list_all()) - - assert len(fs_list) == 0 - - -def test_delete(): - """ - arrange: Create metrics storage for a runner. - act: Call delete - assert: The storage is deleted. - """ - runner_name = secrets.token_hex(16) - storage.create(runner_name) - - storage.delete(runner_name) - - with pytest.raises(GetMetricsStorageError): - storage.get(runner_name) - - -def test_delete_raises_error(): - """ - arrange: Nothing. - act: Call delete. - assert: A DeleteMetricsStorageError is raised. - """ - runner_name = secrets.token_hex(16) - - with pytest.raises(DeleteMetricsStorageError): - storage.delete(runner_name) - - -def test_get(): - """ - arrange: Given a runner name. - act: Call create and get. - assert: A metrics storage object for this runner is returned. - """ - runner_name = secrets.token_hex(16) - - storage.create(runner_name) - ms = storage.get(runner_name) - - assert isinstance(ms, MetricsStorage) - assert ms.runner_name == runner_name - - -def test_get_raises_error_if_not_found(): - """ - arrange: Nothing. - act: Call get. - assert: A GetMetricsStorageError is raised. - """ - runner_name = secrets.token_hex(16) - - with pytest.raises(GetMetricsStorageError): - storage.get(runner_name) - - -def test_quarantine(filesystem_paths: dict[str, Path], tmp_path: Path): - """ - arrange: Create a storage for a runner with a file in it. - act: Call quarantine. - assert: The storage is moved to the quarantine. 
- """ - runner_name = secrets.token_hex(16) - ms = storage.create(runner_name) - ms.path.joinpath("test.txt").write_text("foo bar") - - storage.move_to_quarantine(storage, runner_name) - - tarfile_path = filesystem_paths["quarantine"].joinpath(runner_name).with_suffix(".tar.gz") - assert tarfile_path.exists() - tarfile.open(tarfile_path).extractall(path=tmp_path) - assert tmp_path.joinpath(f"{runner_name}/test.txt").exists() - assert tmp_path.joinpath(f"{runner_name}/test.txt").read_text(encoding="utf-8") == "foo bar" - assert not ms.path.exists() - - -def test_quarantine_raises_error(): - """ - arrange: Nothing. - act: Call quarantine. - assert: A QuarantineMetricsStorageError is raised. - """ - runner_name = secrets.token_hex(16) - - with pytest.raises(QuarantineMetricsStorageError): - storage.move_to_quarantine(storage, runner_name) diff --git a/tests/unit/mock.py b/tests/unit/mock.py index be3e07ca7..78c0c6990 100644 --- a/tests/unit/mock.py +++ b/tests/unit/mock.py @@ -12,8 +12,9 @@ from pathlib import Path from typing import IO, Optional, Sequence, Union +from github_runner_manager.types_.github import RegistrationToken, RemoveToken, RunnerApplication + from errors import LxdError, RunnerError -from github_type import RegistrationToken, RemoveToken, RunnerApplication from lxd_type import LxdNetwork from runner import LxdInstanceConfig diff --git a/tests/unit/mock_runner_managers.py b/tests/unit/mock_runner_managers.py new file mode 100644 index 000000000..b52afa538 --- /dev/null +++ b/tests/unit/mock_runner_managers.py @@ -0,0 +1,295 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import random +import secrets +from dataclasses import dataclass +from typing import Iterable, Iterator, Sequence +from unittest.mock import MagicMock + +from github_runner_manager.manager.cloud_runner_manager import ( + CloudRunnerInstance, + CloudRunnerManager, + CloudRunnerState, + InstanceId, +) +from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.metrics.runner import RunnerMetrics +from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner + +from charm_state import GitHubPath +from github_client import GithubClient +from tests.unit.mock import MockGhapiClient + + +@dataclass +class MockRunner: + """Mock of a runner. + + Attributes: + name: The name of the runner. + instance_id: The instance id of the runner. + cloud_state: The cloud state of the runner. + github_state: The github state of the runner. + health: The health state of the runner. + """ + + name: str + instance_id: InstanceId + cloud_state: CloudRunnerState + github_state: GitHubRunnerState + health: bool + + def __init__(self, name: str): + """Construct the object. + + Args: + name: The name of the runner. + """ + self.name = name + self.instance_id = secrets.token_hex(6) + self.cloud_state = CloudRunnerState.ACTIVE + self.github_state = GitHubRunnerState.IDLE + self.health = True + + def to_cloud_runner(self) -> CloudRunnerInstance: + """Construct CloudRunnerInstance from this object. + + Returns: + The CloudRunnerInstance instance. + """ + return CloudRunnerInstance( + name=self.name, + instance_id=self.instance_id, + health=self.health, + state=self.cloud_state, + ) + + +@dataclass +class SharedMockRunnerManagerState: + """State shared by mock runner managers. + + For sharing the mock runner states between MockCloudRunnerManager and MockGitHubRunnerManager. + + Attributes: + runners: The runners. 
+ """ + + runners: dict[InstanceId, MockRunner] + + def __init__(self): + """Construct the object.""" + self.runners = {} + + +class MockCloudRunnerManager(CloudRunnerManager): + """Mock of CloudRunnerManager. + + Metrics is not supported in this mock. + + Attributes: + name_prefix: The naming prefix for runners managed. + prefix: The naming prefix for runners managed. + state: The shared state between mocks runner managers. + """ + + def __init__(self, state: SharedMockRunnerManagerState): + """Construct the object. + + Args: + state: The shared state between cloud and github runner managers. + """ + self.prefix = f"mock_{secrets.token_hex(4)}" + self.state = state + + @property + def name_prefix(self) -> str: + """Get the name prefix of the self-hosted runners.""" + return self.prefix + + def create_runner(self, registration_token: str) -> InstanceId: + """Create a self-hosted runner. + + Args: + registration_token: The GitHub registration token for registering runners. + + Returns: + The instance id of the runner created. + """ + name = f"{self.name_prefix}-{secrets.token_hex(6)}" + runner = MockRunner(name) + self.state.runners[runner.instance_id] = runner + return runner.instance_id + + def get_runner(self, instance_id: InstanceId) -> CloudRunnerInstance | None: + """Get a self-hosted runner by instance id. + + Args: + instance_id: The instance id. + + Returns: + The runner instance if found else None. + """ + runner = self.state.runners.get(instance_id, None) + if runner is not None: + return runner.to_cloud_runner() + return None + + def get_runners( + self, states: Sequence[CloudRunnerState] | None = None + ) -> tuple[CloudRunnerInstance, ...]: + """Get self-hosted runners by state. + + Args: + states: Filter for the runners with these github states. If None all states will be + included. + + Returns: + The list of runner instances. 
+ """ + if states is None: + states = [member.value for member in CloudRunnerState] + + state_set = set(states) + return tuple( + runner.to_cloud_runner() + for runner in self.state.runners.values() + if runner.cloud_state in state_set + ) + + def delete_runner(self, instance_id: InstanceId, remove_token: str) -> RunnerMetrics | None: + """Delete self-hosted runner. + + Args: + instance_id: The instance id of the runner to delete. + remove_token: The GitHub remove token. + + Returns: + Any runner metrics produced during deletion. + """ + runner = self.state.runners.pop(instance_id, None) + if runner is not None: + return iter([MagicMock()]) + return iter([]) + + def flush_runners(self, remove_token: str, busy: bool = False) -> Iterator[RunnerMetrics]: + """Stop all runners. + + Args: + remove_token: The GitHub remove token for removing runners. + busy: If false, only idle runners are removed. If true, both idle and busy runners are + removed. + + Returns: + Any runner metrics produced during flushing. + """ + if busy: + self.state.runners = {} + else: + self.state.runners = { + instance_id: runner + for instance_id, runner in self.state.runners.items() + if runner.github_state == GitHubRunnerState.BUSY + } + return iter([MagicMock()]) + + def cleanup(self, remove_token: str) -> Iterator[RunnerMetrics]: + """Cleanup runner and resource on the cloud. + + Perform health check on runner and delete the runner if it fails. + + Args: + remove_token: The GitHub remove token for removing runners. + + Returns: + Any runner metrics produced during cleanup. + """ + # Do nothing in mocks. + return iter([MagicMock()]) + + +class MockGitHubRunnerManager: + """Mock of GitHubRunnerManager. + + Attributes: + github: The GitHub client. + name_prefix: The naming prefix for runner managed. + state: The shared state between mock runner managers. + path: The GitHub path to register the runners under. 
+ """ + + def __init__(self, name_prefix: str, path: GitHubPath, state: SharedMockRunnerManagerState): + """Construct the object. + + Args: + name_prefix: The naming prefix for runners managed. + path: The GitHub path to register the runners under. + state: The shared state between mock runner managers. + """ + self.github = GithubClient("mock_token") + self.github._client = MockGhapiClient("mock_token") + self.name_prefix = name_prefix + self.state = state + self.path = path + + def get_registration_token(self) -> str: + """Get the registration token for registering runners on GitHub. + + Returns: + A fixed mock registration token. + """ + return "mock_registration_token" + + def get_removal_token(self) -> str: + """Get the remove token for removing runners on GitHub. + + Returns: + A fixed mock remove token. + """ + return "mock_remove_token" + + def get_runners( + self, github_states: Iterable[GitHubRunnerState] | None = None + ) -> tuple[SelfHostedRunner, ...]: + """Get the runners, optionally filtered by GitHub state. + + Args: + github_states: The states to filter for. If None, all states are included. + + Returns: + Tuple of runners built from the shared state, each with a random id. + """ + if github_states is None: + github_states = [member.value for member in GitHubRunnerState] + + github_state_set = set(github_states) + return tuple( + SelfHostedRunner( + busy=runner.github_state == GitHubRunnerState.BUSY, + id=random.randint(1, 1000000), + labels=[], + os="linux", + name=runner.name, + status=( + GitHubRunnerStatus.OFFLINE + if runner.github_state == GitHubRunnerState.OFFLINE + else GitHubRunnerStatus.ONLINE + ), + ) + for runner in self.state.runners.values() + if runner.github_state in github_state_set + ) + + def delete_runners(self, states: Iterable[GitHubRunnerState]) -> None: + """Delete the runners whose GitHub state is in the given states. + + Args: + states: The states to filter the runners to delete.
+ """ + github_states = set(states) + self.state.runners = { + instance_id: runner + for instance_id, runner in self.state.runners.items() + if runner.github_state not in github_states + } diff --git a/tests/unit/reactive/__init__.py b/tests/unit/reactive/__init__.py deleted file mode 100644 index 188515554..000000000 --- a/tests/unit/reactive/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. diff --git a/tests/unit/reactive/test_consumer.py b/tests/unit/reactive/test_consumer.py deleted file mode 100644 index 2a443c9b3..000000000 --- a/tests/unit/reactive/test_consumer.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. - -import secrets -from contextlib import closing - -import pytest -from kombu import Connection, Message - -from reactive import consumer -from reactive.consumer import JobError - -IN_MEMORY_URI = "memory://" -FAKE_RUN_URL = "https://api.github.com/repos/fakeusergh-runner-test/actions/runs/8200803099" - - -def test_consume(caplog: pytest.LogCaptureFixture): - """ - arrange: A job placed in the message queue. - act: Call consume - assert: The job is logged. 
- """ - queue_name = secrets.token_hex(16) - job_details = consumer.JobDetails( - labels=[secrets.token_hex(16), secrets.token_hex(16)], - run_url=FAKE_RUN_URL, - ) - _put_in_queue(job_details.json(), queue_name) - - # we use construct to avoid pydantic validation as IN_MEMORY_URI is not a valid URL - consumer.consume(IN_MEMORY_URI, queue_name) - assert str(job_details.labels) in caplog.text - assert job_details.run_url in caplog.text - - -@pytest.mark.parametrize( - "job_str", - [ - pytest.param( - '{"labels": ["label1", "label2"], "status": "completed"}', id="run_url missing" - ), - pytest.param( - '{"status": "completed", "run_url": "https://example.com"}', id="labels missing" - ), - pytest.param("no json at all", id="invalid json"), - ], -) -def test_job_details_validation_error(job_str: str): - """ - arrange: A job placed in the message queue with invalid details. - act: Call consume - assert: A JobError is raised and the message is requeued. - """ - queue_name = secrets.token_hex(16) - _put_in_queue(job_str, queue_name) - - with pytest.raises(JobError) as exc_info: - consumer.consume(IN_MEMORY_URI, queue_name) - assert "Invalid job details" in str(exc_info.value) - - # Ensure message has been requeued by reconsuming it - msg = _consume_from_queue(queue_name) - assert msg.payload == job_str - - -def _put_in_queue(msg: str, queue_name: str) -> None: - """Put a job in the message queue. - - Args: - msg: The job details. - queue_name: The name of the queue - """ - with Connection(IN_MEMORY_URI) as conn: - with closing(conn.SimpleQueue(queue_name)) as simple_queue: - simple_queue.put(msg, retry=True) - - -def _consume_from_queue(queue_name: str) -> Message: - """Consume a job from the message queue. - - Args: - queue_name: The name of the queue - - Returns: - The message consumed from the queue. 
- """ - with Connection(IN_MEMORY_URI) as conn: - with closing(conn.SimpleQueue(queue_name)) as simple_queue: - return simple_queue.get(block=False) diff --git a/tests/unit/reactive/test_runner_manager.py b/tests/unit/reactive/test_runner_manager.py deleted file mode 100644 index cd25cf728..000000000 --- a/tests/unit/reactive/test_runner_manager.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import os -import secrets -import subprocess -from pathlib import Path -from subprocess import CompletedProcess -from unittest.mock import MagicMock - -import pytest - -from reactive.runner_manager import ( - PIDS_COMMAND_LINE, - PYTHON_BIN, - REACTIVE_RUNNER_SCRIPT_FILE, - ReactiveRunnerError, - reconcile, -) -from utilities import secure_run_subprocess - -EXAMPLE_MQ_URI = "http://example.com" - - -@pytest.fixture(name="log_dir", autouse=True) -def log_dir_path_fixture(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: - """Return the path to the log file.""" - log_file_path = tmp_path / "logs" - monkeypatch.setattr("reactive.runner_manager.REACTIVE_RUNNER_LOG_DIR", log_file_path) - monkeypatch.setattr("shutil.chown", lambda *args, **kwargs: None) - return log_file_path - - -@pytest.fixture(name="secure_run_subprocess_mock") -def secure_run_subprocess_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock the ps command.""" - secure_run_subprocess_mock = MagicMock(spec=secure_run_subprocess) - monkeypatch.setattr( - "reactive.runner_manager.secure_run_subprocess", secure_run_subprocess_mock - ) - return secure_run_subprocess_mock - - -@pytest.fixture(name="os_kill_mock", autouse=True) -def os_kill_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock the os.kill function.""" - os_kill_mock = MagicMock(spec=os.kill) - monkeypatch.setattr("os.kill", os_kill_mock) - return os_kill_mock - - -@pytest.fixture(name="subprocess_popen_mock") -def 
subprocess_popen_mock_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock the subprocess.Popen function.""" - popen_result = MagicMock(spec=subprocess.Popen, pid=1234, returncode=0) - subprocess_popen_mock = MagicMock( - spec=subprocess.Popen, - return_value=popen_result, - ) - monkeypatch.setattr("subprocess.Popen", subprocess_popen_mock) - return subprocess_popen_mock - - -def test_reconcile_spawns_runners( - secure_run_subprocess_mock: MagicMock, subprocess_popen_mock: MagicMock, log_dir: Path -): - """ - arrange: Mock that two reactive runner processes are active. - act: Call reconcile with a quantity of 5. - assert: Three runners are spawned. Log file is setup. - """ - queue_name = secrets.token_hex(16) - _arrange_reactive_processes(secure_run_subprocess_mock, count=2) - - delta = reconcile(5, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert delta == 3 - assert subprocess_popen_mock.call_count == 3 - assert log_dir.exists() - - -def test_reconcile_does_not_spawn_runners( - secure_run_subprocess_mock: MagicMock, subprocess_popen_mock: MagicMock -): - """ - arrange: Mock that two reactive runner processes are active. - act: Call reconcile with a quantity of 2. - assert: No runners are spawned. - """ - queue_name = secrets.token_hex(16) - _arrange_reactive_processes(secure_run_subprocess_mock, count=2) - - delta = reconcile(2, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert delta == 0 - assert subprocess_popen_mock.call_count == 0 - - -def test_reconcile_kills_processes_for_too_many_processes( - secure_run_subprocess_mock: MagicMock, - subprocess_popen_mock: MagicMock, - os_kill_mock: MagicMock, -): - """ - arrange: Mock that 3 reactive runner processes are active. - act: Call reconcile with a quantity of 1. - assert: 2 processes are killed. 
- """ - queue_name = secrets.token_hex(16) - _arrange_reactive_processes(secure_run_subprocess_mock, count=3) - delta = reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert delta == -2 - assert subprocess_popen_mock.call_count == 0 - assert os_kill_mock.call_count == 2 - - -def test_reconcile_ignore_process_not_found_on_kill( - secure_run_subprocess_mock: MagicMock, - subprocess_popen_mock: MagicMock, - os_kill_mock: MagicMock, -): - """ - arrange: Mock 3 reactive processes and os.kill to fail once with a ProcessLookupError. - act: Call reconcile with a quantity of 1. - assert: The returned delta is still -2. - """ - queue_name = secrets.token_hex(16) - _arrange_reactive_processes(secure_run_subprocess_mock, count=3) - os_kill_mock.side_effect = [None, ProcessLookupError] - delta = reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert delta == -2 - assert subprocess_popen_mock.call_count == 0 - assert os_kill_mock.call_count == 2 - - -def test_reconcile_raises_reactive_runner_error_on_ps_failure( - secure_run_subprocess_mock: MagicMock, -): - """ - arrange: Mock that the ps command fails. - act: Call reconcile with a quantity of 1. - assert: A ReactiveRunnerError is raised. - """ - queue_name = secrets.token_hex(16) - secure_run_subprocess_mock.return_value = CompletedProcess( - args=PIDS_COMMAND_LINE, - returncode=1, - stdout=b"", - stderr=b"error", - ) - - with pytest.raises(ReactiveRunnerError) as err: - reconcile(1, mq_uri=EXAMPLE_MQ_URI, queue_name=queue_name) - - assert "Failed to get list of processes" in str(err.value) - - -def _arrange_reactive_processes(secure_run_subprocess_mock: MagicMock, count: int): - """Mock reactive runner processes are active. - - Args: - secure_run_subprocess_mock: The mock to use for the ps command. - count: The number of processes. 
- """ - process_cmds_before = "\n".join( - [f"{PYTHON_BIN} {REACTIVE_RUNNER_SCRIPT_FILE}\t{i}" for i in range(count)] - ) - - secure_run_subprocess_mock.return_value = CompletedProcess( - args=PIDS_COMMAND_LINE, - returncode=0, - stdout=f"CMD\n{process_cmds_before}".encode("utf-8"), - stderr=b"", - ) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 8b19a7797..a28fc9743 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -12,6 +12,7 @@ import pytest import yaml +from github_runner_manager.types_.github import GitHubOrg, GitHubRepo, GitHubRunnerStatus from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, StatusBase, WaitingStatus from ops.testing import Harness @@ -28,8 +29,6 @@ VM_CPU_CONFIG_NAME, VM_DISK_CONFIG_NAME, Arch, - GithubOrg, - GithubRepo, InstanceType, OpenStackCloudsYAML, OpenstackImage, @@ -41,15 +40,13 @@ LogrotateSetupError, MissingMongoDBError, MissingRunnerBinaryError, - OpenStackUnauthorizedError, RunnerError, SubprocessError, TokenError, ) from event_timer import EventTimer, TimerEnableError from firewall import FirewallEntry -from github_type import GitHubRunnerStatus -from runner_manager import RunnerInfo, RunnerManagerConfig +from runner_manager import LXDRunnerManagerConfig, RunnerInfo TEST_PROXY_SERVER_URL = "http://proxy.server:1234" @@ -158,8 +155,10 @@ def stub_update_runner_bin(*args, **kwargs) -> None: harness = Harness(GithubRunnerCharm) harness.update_config({PATH_CONFIG_NAME: "mock/repo", TOKEN_CONFIG_NAME: "mocktoken"}) harness.begin() - monkeypatch.setattr("runner_manager.RunnerManager.update_runner_bin", stub_update_runner_bin) - monkeypatch.setattr("runner_manager.RunnerManager._runners_in_pre_job", lambda self: False) + monkeypatch.setattr( + "runner_manager.LXDRunnerManager.update_runner_bin", stub_update_runner_bin + ) + monkeypatch.setattr("runner_manager.LXDRunnerManager._runners_in_pre_job", lambda self: False) 
monkeypatch.setattr("charm.EventTimer.ensure_event_timer", MagicMock()) monkeypatch.setattr("charm.logrotate.setup", MagicMock()) return harness @@ -208,7 +207,7 @@ def test_common_install_code( monkeypatch.setattr("charm.logrotate.setup", setup_logrotate := MagicMock()) monkeypatch.setattr( - "runner_manager.RunnerManager.schedule_build_runner_image", + "runner_manager.LXDRunnerManager.schedule_build_runner_image", schedule_build_runner_image := MagicMock(), ) event_timer_mock = MagicMock(spec=EventTimer) @@ -243,11 +242,11 @@ def test_common_install_code_does_not_rebuild_image( assert: Image is not rebuilt. """ monkeypatch.setattr( - "runner_manager.RunnerManager.build_runner_image", + "runner_manager.LXDRunnerManager.build_runner_image", build_runner_image := MagicMock(), ) monkeypatch.setattr( - "runner_manager.RunnerManager.has_runner_image", + "runner_manager.LXDRunnerManager.has_runner_image", MagicMock(return_value=True), ) getattr(harness.charm.on, hook).emit() @@ -439,7 +438,7 @@ def test_database_integration_events_trigger_reconciliation( class TestCharm(unittest.TestCase): """Test the GithubRunner charm.""" - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -460,8 +459,8 @@ def test_org_register(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( - path=GithubOrg(org="mockorg", group="mockgroup"), + LXDRunnerManagerConfig( + path=GitHubOrg(org="mockorg", group="mockgroup"), token="mocktoken", image="jammy", service_token=token, @@ -470,7 +469,7 @@ def test_org_register(self, run, wt, mkdir, rm): ), ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -490,8 +489,8 @@ def test_repo_register(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( - 
path=GithubRepo(owner="mockorg", repo="repo"), + LXDRunnerManagerConfig( + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, @@ -500,7 +499,7 @@ def test_repo_register(self, run, wt, mkdir, rm): ), ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -527,7 +526,7 @@ def test_exceed_free_disk_size(self, run, wt, mkdir, rm): ) ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -548,8 +547,8 @@ def test_update_config(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( - path=GithubRepo(owner="mockorg", repo="repo"), + LXDRunnerManagerConfig( + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, @@ -570,8 +569,8 @@ def test_update_config(self, run, wt, mkdir, rm): rm.assert_called_with( "github-runner", "0", - RunnerManagerConfig( - path=GithubRepo(owner="mockorg", repo="repo"), + LXDRunnerManagerConfig( + path=GitHubRepo(owner="mockorg", repo="repo"), token="mocktoken", image="jammy", service_token=token, @@ -584,7 +583,7 @@ def test_update_config(self, run, wt, mkdir, rm): ) mock_rm.reset_mock() - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -629,7 +628,7 @@ def test_on_update_status(self, run, wt, mkdir, rm): with pytest.raises(TimerEnableError): harness.charm.on.update_status.emit() - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -641,7 +640,7 @@ def test_on_stop(self, run, wt, mkdir, rm): harness.charm.on.stop.emit() mock_rm.flush.assert_called() - @patch("charm.RunnerManager") + 
@patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -660,8 +659,8 @@ def test_on_start_failure(self, run, wt, mkdir, rm): "Failed to start runners: mock error" ) - @patch("charm.RunnerManager") - @patch("charm.OpenstackRunnerManager") + @patch("charm.LXDRunnerManager") + @patch("charm.RunnerScaler") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -700,7 +699,7 @@ def test_on_config_changed_openstack_clouds_yaml(self, run, wt, mkdir, orm, rm): assert harness.charm.unit.status == BlockedStatus("Please provide image integration.") - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -719,7 +718,7 @@ def test_check_runners_action(self, run, wt, mkdir, rm): {"online": 2, "offline": 2, "unknown": 1, "runners": "test runner 0, test runner 1"} ) - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -733,7 +732,7 @@ def test_check_runners_action_with_errors(self, run, wt, mkdir, rm): harness.charm._on_check_runners_action(mock_event) mock_event.fail.assert_called_with("Invalid Github config, Missing path configuration") - @patch("charm.RunnerManager") + @patch("charm.LXDRunnerManager") @patch("pathlib.Path.mkdir") @patch("pathlib.Path.write_text") @patch("subprocess.run") @@ -759,7 +758,6 @@ def test_on_flush_runners_action(self, run, wt, mkdir, rm): pytest.param(ConfigurationError, BlockedStatus, id="charm config error"), pytest.param(TokenError, BlockedStatus, id="github token error"), pytest.param(MissingRunnerBinaryError, MaintenanceStatus, id="runner binary error"), - pytest.param(OpenStackUnauthorizedError, BlockedStatus, id="openstack auth error"), ], ) def test_catch_charm_errors( @@ -936,7 +934,7 @@ def test__on_image_relation_image_ready(): 
harness.charm._setup_state = MagicMock(return_value=state_mock) harness.charm._get_set_image_ready_status = MagicMock(return_value=True) runner_manager_mock = MagicMock() - harness.charm._get_openstack_runner_manager = MagicMock(return_value=runner_manager_mock) + harness.charm._get_runner_scaler = MagicMock(return_value=runner_manager_mock) harness.charm._on_image_relation_changed(MagicMock()) diff --git a/tests/unit/test_charm_state.py b/tests/unit/test_charm_state.py index 8479782a8..b7df8a5dc 100644 --- a/tests/unit/test_charm_state.py +++ b/tests/unit/test_charm_state.py @@ -8,20 +8,22 @@ from pathlib import Path from unittest.mock import MagicMock +import github_runner_manager.openstack_cloud import pytest import yaml from charms.data_platform_libs.v0.data_interfaces import DatabaseRequires +from github_runner_manager.types_.github import GitHubOrg, GitHubRepo from pydantic import BaseModel from pydantic.error_wrappers import ValidationError from pydantic.networks import IPv4Address import charm_state -import openstack_cloud from charm_state import ( BASE_IMAGE_CONFIG_NAME, DEBUG_SSH_INTEGRATION_NAME, DENYLIST_CONFIG_NAME, DOCKERHUB_MIRROR_CONFIG_NAME, + GROUP_CONFIG_NAME, IMAGE_INTEGRATION_NAME, LABELS_CONFIG_NAME, OPENSTACK_CLOUDS_YAML_CONFIG_NAME, @@ -41,8 +43,6 @@ CharmState, FirewallEntry, GithubConfig, - GithubOrg, - GithubRepo, ImmutableConfigChangedError, LocalLxdRunnerConfig, OpenstackImage, @@ -65,7 +65,7 @@ def test_github_repo_path(): """ owner = "test_owner" repo = "test_repo" - github_repo = GithubRepo(owner, repo) + github_repo = GitHubRepo(owner, repo) path = github_repo.path() @@ -80,27 +80,28 @@ def test_github_org_path(): """ org = "test_org" group = "test_group" - github_org = GithubOrg(org, group) + github_org = GitHubOrg(org, group) path = github_org.path() assert path == org -def test_parse_github_path_invalid(): +def test_github_config_from_charm_invalud_path(): """ arrange: Create an invalid GitHub path string and runner group name. 
act: Call parse_github_path with the invalid path string and runner group name. assert: Verify that the function raises CharmConfigInvalidError. """ - path_str = "invalidpath/" - runner_group = "test_group" + mock_charm = MockGithubRunnerCharmFactory() + mock_charm.config[PATH_CONFIG_NAME] = "invalidpath/" + mock_charm.config[GROUP_CONFIG_NAME] = "test_group" with pytest.raises(CharmConfigInvalidError): - charm_state.parse_github_path(path_str, runner_group) + GithubConfig.from_charm(mock_charm) -def test_github_config_from_charm_invalid_path(): +def test_github_config_from_charm_empty_path(): """ arrange: Create a mock CharmBase instance with an empty path configuration. act: Call from_charm method with the mock CharmBase instance. @@ -129,14 +130,14 @@ def test_github_config_from_charm_invalid_token(): @pytest.mark.parametrize( "path_str, runner_group, expected_type, expected_attrs", [ - ("owner/repo", "test_group", GithubRepo, {"owner": "owner", "repo": "repo"}), - ("test_org", "test_group", GithubOrg, {"org": "test_org", "group": "test_group"}), + ("owner/repo", "test_group", GitHubRepo, {"owner": "owner", "repo": "repo"}), + ("test_org", "test_group", GitHubOrg, {"org": "test_org", "group": "test_group"}), ], ) def test_parse_github_path( path_str: str, runner_group: str, - expected_type: GithubRepo | GithubOrg, + expected_type: GitHubRepo | GitHubOrg, expected_attrs: dict[str, str], ): """ @@ -367,9 +368,9 @@ def test_parse_openstack_clouds_initialize_fail( mock_charm = MockGithubRunnerCharmFactory() mock_charm.config[OPENSTACK_CLOUDS_YAML_CONFIG_NAME] = valid_yaml_config monkeypatch.setattr( - openstack_cloud, + github_runner_manager.openstack_cloud, "initialize", - MagicMock(side_effect=openstack_cloud.OpenStackInvalidConfigError), + MagicMock(side_effect=github_runner_manager.openstack_cloud.OpenStackInvalidConfigError), ) with pytest.raises(CharmConfigInvalidError): @@ -498,7 +499,7 @@ def test_charm_config_from_charm_valid(): result = 
CharmConfig.from_charm(mock_charm) - assert result.path == GithubRepo(owner="owner", repo="repo") + assert result.path == GitHubRepo(owner="owner", repo="repo") assert result.reconcile_interval == 5 assert result.denylist == [ FirewallEntry(ip_range="192.168.1.1"), diff --git a/tests/unit/test_github_client.py b/tests/unit/test_github_client.py deleted file mode 100644 index b01a75a01..000000000 --- a/tests/unit/test_github_client.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import http -import random -import secrets -from collections import namedtuple -from datetime import datetime, timezone -from unittest.mock import MagicMock -from urllib.error import HTTPError - -import pytest - -from charm_state import GithubRepo -from errors import JobNotFoundError -from github_client import GithubClient -from github_type import JobConclusion, JobStats - -JobStatsRawData = namedtuple( - "JobStatsRawData", - ["created_at", "started_at", "runner_name", "conclusion", "id"], -) - - -@pytest.fixture(name="job_stats_raw") -def job_stats_fixture() -> JobStatsRawData: - """Create a JobStats object.""" - runner_name = secrets.token_hex(16) - return JobStatsRawData( - created_at="2021-10-01T00:00:00Z", - started_at="2021-10-01T01:00:00Z", - conclusion="success", - runner_name=runner_name, - id=random.randint(1, 1000), - ) - - -@pytest.fixture(name="github_client") -def github_client_fixture(job_stats_raw: JobStatsRawData) -> GithubClient: - """Create a GithubClient object with a mocked GhApi object.""" - gh_client = GithubClient("token") - gh_client._client = MagicMock() - gh_client._client.actions.list_jobs_for_workflow_run.return_value = { - "jobs": [ - { - "created_at": job_stats_raw.created_at, - "started_at": job_stats_raw.started_at, - "runner_name": job_stats_raw.runner_name, - "conclusion": job_stats_raw.conclusion, - "id": job_stats_raw.id, - } - ] - } - - return gh_client - - -def 
_mock_multiple_pages_for_job_response( - github_client: GithubClient, job_stats_raw: JobStatsRawData, include_runner: bool = True -): - """Mock the list_jobs_for_workflow_run to return multiple pages. - - Args: - github_client: The GithubClient object to mock. - job_stats_raw: The JobStatsRawData object to use for the response. - include_runner: Whether to include the runner in the response for one of the jobs. - """ - no_of_pages = random.choice(range(1, 5)) - no_of_jobs_per_page = random.choice(range(1, 4)) - runner_names = [secrets.token_hex(16) for _ in range(no_of_pages * no_of_jobs_per_page)] - - if include_runner: - runner_names[random.choice(range(no_of_pages))] = job_stats_raw.runner_name - - github_client._client.actions.list_jobs_for_workflow_run.side_effect = [ - { - "jobs": [ - { - "created_at": job_stats_raw.created_at, - "started_at": job_stats_raw.started_at, - "runner_name": runner_names[i * no_of_jobs_per_page + j], - "conclusion": job_stats_raw.conclusion, - "id": job_stats_raw.id, - } - for j in range(no_of_jobs_per_page) - ] - } - for i in range(no_of_pages) - ] + [{"jobs": []}] - - -def test_get_job_info(github_client: GithubClient, job_stats_raw: JobStatsRawData): - """ - arrange: A mocked Github Client that returns one page of jobs containing one job \ - with the runner. - act: Call get_job_info. - assert: The correct JobStats object is returned. 
- """ - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - job_stats = github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) - assert job_stats == JobStats( - created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc), - started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - runner_name=job_stats_raw.runner_name, - conclusion=JobConclusion.SUCCESS, - job_id=job_stats_raw.id, - ) - - -def test_get_job_info_no_conclusion(github_client: GithubClient, job_stats_raw: JobStatsRawData): - """ - arrange: A mocked Github Client that returns one page of jobs containing one job \ - with the runner with conclusion set to None. - act: Call get_job_info. - assert: JobStats object with conclusion set to None is returned. - """ - github_client._client.actions.list_jobs_for_workflow_run.return_value = { - "jobs": [ - { - "created_at": job_stats_raw.created_at, - "started_at": job_stats_raw.started_at, - "runner_name": job_stats_raw.runner_name, - "conclusion": None, - "id": job_stats_raw.id, - } - ] - } - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - job_stats = github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) - assert job_stats == JobStats( - created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc), - started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - runner_name=job_stats_raw.runner_name, - conclusion=None, - job_id=job_stats_raw.id, - ) - - -def test_github_api_pagination_multiple_pages( - github_client: GithubClient, job_stats_raw: JobStatsRawData -): - """ - arrange: A mocked Github Client that returns multiple pages of jobs containing \ - one job with the runner. - act: Call get_job_info. - assert: The correct JobStats object is returned. 
- """ - _mock_multiple_pages_for_job_response( - github_client=github_client, job_stats_raw=job_stats_raw, include_runner=True - ) - - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - job_stats = github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) - assert job_stats == JobStats( - created_at=datetime(2021, 10, 1, 0, 0, 0, tzinfo=timezone.utc), - started_at=datetime(2021, 10, 1, 1, 0, 0, tzinfo=timezone.utc), - runner_name=job_stats_raw.runner_name, - conclusion=JobConclusion.SUCCESS, - job_id=job_stats_raw.id, - ) - - -def test_github_api_pagination_job_not_found( - github_client: GithubClient, job_stats_raw: JobStatsRawData -): - """ - arrange: A mocked Github Client that returns multiple pages of jobs containing \ - no job with the runner. - act: Call get_job_info. - assert: An exception is raised. - """ - _mock_multiple_pages_for_job_response( - github_client=github_client, job_stats_raw=job_stats_raw, include_runner=False - ) - - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - - with pytest.raises(JobNotFoundError): - github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) - - -def test_github_api_http_error(github_client: GithubClient, job_stats_raw: JobStatsRawData): - github_client._client.actions.list_jobs_for_workflow_run.side_effect = HTTPError( - "http://test.com", 500, "", http.client.HTTPMessage(), None - ) - github_repo = GithubRepo(owner=secrets.token_hex(16), repo=secrets.token_hex(16)) - - with pytest.raises(JobNotFoundError): - github_client.get_job_info( - path=github_repo, - workflow_run_id=secrets.token_hex(16), - runner_name=job_stats_raw.runner_name, - ) diff --git a/tests/unit/test_runner_manager.py b/tests/unit/test_lxd_runner_manager.py similarity index 89% rename from tests/unit/test_runner_manager.py rename to 
tests/unit/test_lxd_runner_manager.py index 66b09cd60..215cbe7e0 100644 --- a/tests/unit/test_runner_manager.py +++ b/tests/unit/test_lxd_runner_manager.py @@ -1,34 +1,37 @@ # Copyright 2024 Canonical Ltd. # See LICENSE file for licensing details. -"""Test cases of RunnerManager class.""" +"""Test cases of LXDRunnerManager class.""" import random import secrets from pathlib import Path from unittest.mock import MagicMock, call +import github_runner_manager.reactive.runner_manager import pytest +from github_runner_manager.metrics.events import ( + Reconciliation, + RunnerInstalled, + RunnerStart, + RunnerStop, +) +from github_runner_manager.metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME +from github_runner_manager.metrics.storage import MetricsStorage +from github_runner_manager.types_.github import GitHubOrg, GitHubRepo, RunnerApplication from pytest import LogCaptureFixture, MonkeyPatch -import reactive.runner_manager import shared_fs from charm_state import ( Arch, CharmConfig, CharmState, - GithubOrg, - GithubRepo, ProxyConfig, ReactiveConfig, VirtualMachineResources, ) from errors import IssueMetricEventError, RunnerBinaryError -from github_type import RunnerApplication -from metrics.events import Reconciliation, RunnerInstalled, RunnerStart, RunnerStop -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from metrics.storage import MetricsStorage from runner import Runner, RunnerStatus -from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, RunnerManager, RunnerManagerConfig +from runner_manager import BUILD_IMAGE_SCRIPT_FILENAME, LXDRunnerManager, LXDRunnerManagerConfig from runner_type import RunnerNameByHealth from tests.unit.mock import TEST_BINARY, MockLxdImageManager @@ -67,9 +70,9 @@ def charm_state_fixture(charm_config: MagicMock): scope="function", name="runner_manager", params=[ - (GithubOrg("test_org", "test_group"), ProxyConfig()), + (GitHubOrg("test_org", "test_group"), ProxyConfig()), ( - GithubRepo("test_owner", "test_repo"), + 
GitHubRepo("test_owner", "test_repo"), ProxyConfig( no_proxy="test_no_proxy", http=TEST_PROXY_SERVER_URL, @@ -82,15 +85,15 @@ def charm_state_fixture(charm_config: MagicMock): def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state): charm_state.proxy_config = request.param[1] monkeypatch.setattr( - "runner_manager.RunnerManager.runner_bin_path", tmp_path / "mock_runner_binary" + "runner_manager.LXDRunnerManager.runner_bin_path", tmp_path / "mock_runner_binary" ) pool_path = tmp_path / "test_storage" pool_path.mkdir(exist_ok=True) - runner_manager = RunnerManager( + runner_manager = LXDRunnerManager( "test app", "0", - RunnerManagerConfig( + LXDRunnerManagerConfig( path=request.param[0], token=token, image=IMAGE_NAME, @@ -107,7 +110,7 @@ def runner_manager_fixture(request, tmp_path, monkeypatch, token, charm_state): def issue_event_mock_fixture(monkeypatch: MonkeyPatch) -> MagicMock: """Mock the issue_event function.""" issue_event_mock = MagicMock() - monkeypatch.setattr("metrics.events.issue_event", issue_event_mock) + monkeypatch.setattr("github_runner_manager.metrics.events.issue_event", issue_event_mock) return issue_event_mock @@ -131,7 +134,7 @@ def runner_metrics_fixture(monkeypatch: MonkeyPatch) -> MagicMock: @pytest.fixture(name="reactive_reconcile_mock") def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> MagicMock: """Mock the job class.""" - reconcile_mock = MagicMock(spec=reactive.runner_manager.reconcile) + reconcile_mock = MagicMock(spec=github_runner_manager.reactive.runner_manager.reconcile) monkeypatch.setattr("runner_manager.reactive_runner_manager.reconcile", reconcile_mock) reconcile_mock.side_effect = lambda quantity, **kwargs: quantity return reconcile_mock @@ -144,7 +147,7 @@ def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> Magi pytest.param(Arch.X64), ], ) -def test_get_latest_runner_bin_url(runner_manager: RunnerManager, arch: Arch, charm_state): +def 
test_get_latest_runner_bin_url(runner_manager: LXDRunnerManager, arch: Arch, charm_state): """ arrange: Nothing. act: Get runner bin url of existing binary. @@ -168,7 +171,7 @@ def test_get_latest_runner_bin_url(runner_manager: RunnerManager, arch: Arch, ch assert runner_bin["filename"] == filename -def test_get_latest_runner_bin_url_missing_binary(runner_manager: RunnerManager): +def test_get_latest_runner_bin_url_missing_binary(runner_manager: LXDRunnerManager): """ arrange: Given a mocked GH API client that does not return any runner binaries. act: Get runner bin url of non-existing binary. @@ -181,7 +184,7 @@ def test_get_latest_runner_bin_url_missing_binary(runner_manager: RunnerManager) runner_manager.get_latest_runner_bin_url(os_name="not_exist") -def test_update_runner_bin(runner_manager: RunnerManager): +def test_update_runner_bin(runner_manager: LXDRunnerManager): """ arrange: Remove the existing runner binary. act: Update runner binary. @@ -222,7 +225,7 @@ def iter_content(self, *args, **kwargs): assert runner_manager.runner_bin_path.read_bytes() == TEST_BINARY -def test_reconcile_zero_count(runner_manager: RunnerManager): +def test_reconcile_zero_count(runner_manager: LXDRunnerManager): """ arrange: Nothing. act: Reconcile with the current amount of runner. @@ -234,7 +237,7 @@ def test_reconcile_zero_count(runner_manager: RunnerManager): assert delta == 0 -def test_reconcile_create_runner(runner_manager: RunnerManager): +def test_reconcile_create_runner(runner_manager: LXDRunnerManager): """ arrange: Nothing. act: Reconcile to create a runner. @@ -246,7 +249,7 @@ def test_reconcile_create_runner(runner_manager: RunnerManager): assert delta == 1 -def test_reconcile_remove_runner(runner_manager: RunnerManager): +def test_reconcile_remove_runner(runner_manager: LXDRunnerManager): """ arrange: Create online runners. act: Reconcile to remove a runner. 
@@ -282,7 +285,7 @@ def mock_get_runners(): assert delta == -1 -def test_reconcile(runner_manager: RunnerManager, tmp_path: Path): +def test_reconcile(runner_manager: LXDRunnerManager, tmp_path: Path): """ arrange: Setup one runner. act: Reconcile with the current amount of runner. @@ -295,7 +298,7 @@ def test_reconcile(runner_manager: RunnerManager, tmp_path: Path): assert len(runner_manager._get_runners()) == 1 -def test_empty_flush(runner_manager: RunnerManager): +def test_empty_flush(runner_manager: LXDRunnerManager): """ arrange: No initial runners. act: Perform flushing with no runners. @@ -305,7 +308,7 @@ def test_empty_flush(runner_manager: RunnerManager): runner_manager.flush() -def test_flush(runner_manager: RunnerManager, tmp_path: Path): +def test_flush(runner_manager: LXDRunnerManager, tmp_path: Path): """ arrange: Create some runners. act: Perform flushing. @@ -319,7 +322,7 @@ def test_flush(runner_manager: RunnerManager, tmp_path: Path): def test_reconcile_issues_runner_installed_event( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, issue_event_mock: MagicMock, charm_state: MagicMock, @@ -341,7 +344,7 @@ def test_reconcile_issues_runner_installed_event( def test_reconcile_issues_no_runner_installed_event_if_metrics_disabled( - runner_manager: RunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock + runner_manager: LXDRunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock ): """ arrange: Disable issuing of metrics. 
@@ -356,7 +359,7 @@ def test_reconcile_issues_no_runner_installed_event_if_metrics_disabled( def test_reconcile_error_on_issue_event_is_ignored( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, issue_event_mock: MagicMock, charm_state: MagicMock, ): @@ -375,7 +378,7 @@ def test_reconcile_error_on_issue_event_is_ignored( def test_reconcile_issues_reconciliation_metric_event( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, issue_event_mock: MagicMock, runner_metrics: MagicMock, @@ -458,7 +461,7 @@ def mock_get_runners(): def test_reconcile_places_timestamp_in_newly_created_runner( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, monkeypatch: MonkeyPatch, shared_fs: MagicMock, tmp_path: Path, @@ -485,7 +488,7 @@ def test_reconcile_places_timestamp_in_newly_created_runner( def test_reconcile_error_on_placing_timestamp_is_ignored( - runner_manager: RunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock + runner_manager: LXDRunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock ): """ arrange: Enable issuing of metrics and do not create the directory for the shared filesystem\ @@ -504,7 +507,7 @@ def test_reconcile_error_on_placing_timestamp_is_ignored( def test_reconcile_places_no_timestamp_in_newly_created_runner_if_metrics_disabled( - runner_manager: RunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock + runner_manager: LXDRunnerManager, shared_fs: MagicMock, tmp_path: Path, charm_state: MagicMock ): """ arrange: Disable issuing of metrics, mock timestamps and the shared filesystem module. 
@@ -522,7 +525,7 @@ def test_reconcile_places_no_timestamp_in_newly_created_runner_if_metrics_disabl def test_reconcile_reactive_mode( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, reactive_reconcile_mock: MagicMock, caplog: LogCaptureFixture, ): @@ -542,7 +545,7 @@ def test_reconcile_reactive_mode( def test_schedule_build_runner_image( - runner_manager: RunnerManager, + runner_manager: LXDRunnerManager, tmp_path: Path, charm_state: CharmState, monkeypatch: MonkeyPatch, @@ -569,7 +572,7 @@ def test_schedule_build_runner_image( assert cronfile.read_text() == f"4 4,10,16,22 * * * ubuntu {cmd} jammy\n" -def test_has_runner_image(runner_manager: RunnerManager): +def test_has_runner_image(runner_manager: LXDRunnerManager): """ arrange: Multiple setups. 1. no runner image exists. diff --git a/tests/unit/test_openstack_cloud.py b/tests/unit/test_openstack_cloud.py deleted file mode 100644 index 4f599e914..000000000 --- a/tests/unit/test_openstack_cloud.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -from pathlib import Path - -import pytest -import yaml - -import openstack_cloud -from errors import OpenStackInvalidConfigError - - -def test_initialize(clouds_yaml_path: Path, clouds_yaml: dict): - """ - arrange: Mocked clouds.yaml data and path. - act: Call initialize. - assert: The clouds.yaml file is written to disk. - """ - openstack_cloud.initialize(clouds_yaml) - - assert yaml.safe_load(clouds_yaml_path.read_text(encoding="utf-8")) == clouds_yaml - - -@pytest.mark.parametrize( - "invalid_yaml, expected_err_msg", - [ - pytest.param( - {"wrong-key": {"cloud_name": {"auth": {}}}}, "Missing key 'clouds' from config." - ), - pytest.param({}, "Missing key 'clouds' from config."), - pytest.param({"clouds": {}}, "No clouds defined in clouds.yaml."), - ], -) -def test_initialize_validation_error(invalid_yaml: dict, expected_err_msg): - """ - arrange: Mocked clouds.yaml data with invalid data. 
- act: Call initialize. - assert: InvalidConfigError is raised. - """ - with pytest.raises(OpenStackInvalidConfigError) as exc: - openstack_cloud.initialize(invalid_yaml) - assert expected_err_msg in str(exc) diff --git a/tests/unit/test_openstack_manager.py b/tests/unit/test_openstack_manager.py deleted file mode 100644 index 5329b1282..000000000 --- a/tests/unit/test_openstack_manager.py +++ /dev/null @@ -1,1200 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -import random -import secrets -from pathlib import Path -from typing import Optional -from unittest.mock import MagicMock, call - -import jinja2 -import openstack.connection -import openstack.exceptions -import pytest -from fabric.connection import Connection as SSHConnection -from invoke import Result -from openstack.compute.v2.keypair import Keypair -from openstack.compute.v2.server import Server -from pytest import LogCaptureFixture, MonkeyPatch - -import metrics.storage -import reactive.runner_manager -from charm_state import CharmState, ProxyConfig, ReactiveConfig, RepoPolicyComplianceConfig -from errors import OpenStackError, RunnerStartError -from github_type import GitHubRunnerStatus, RunnerApplication, SelfHostedRunner -from metrics import events as metric_events -from metrics.runner import RUNNER_INSTALLED_TS_FILE_NAME -from metrics.storage import MetricsStorage -from openstack_cloud import openstack_manager -from openstack_cloud.openstack_manager import MAX_METRICS_FILE_SIZE, METRICS_EXCHANGE_PATH -from runner_manager_type import FlushMode -from runner_type import RunnerGithubInfo, RunnerNameByHealth -from tests.unit import factories - -FAKE_MONGODB_URI = "mongodb://example.com/db" -CLOUD_NAME = "microstack" - - -@pytest.fixture(autouse=True, name="openstack_connect_mock") -def mock_openstack_connect_fixture(monkeypatch: pytest.MonkeyPatch) -> MagicMock: - """Mock openstack.connect.""" - mock_connect = MagicMock(spec=openstack_manager.openstack.connect) - 
monkeypatch.setattr("openstack_cloud.openstack_manager.openstack.connect", mock_connect) - return mock_connect - - -@pytest.fixture(name="mock_server") -def mock_server_fixture() -> MagicMock: - """Mock OpenStack Server object.""" - mock_server = MagicMock(spec=Server) - mock_server.key_name = "mock_key" - mock_server.addresses.values = MagicMock(return_value=[[{"addr": "10.0.0.1"}]]) - return mock_server - - -@pytest.fixture(name="patch_get_ssh_connection_health_check") -def patch_get_ssh_connection_health_check_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch SSH connection to a MagicMock instance for get_ssh_connection health check.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_ssh_connection.host = "test host IP" - mock_result = MagicMock(spec=Result) - mock_result.ok = True - mock_result.stderr = "" - mock_result.stdout = "hello world" - mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = [mock_ssh_connection] - - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - mock_get_ssh_connection, - ) - - -@pytest.fixture(name="ssh_connection_health_check") -def ssh_connection_health_check_fixture(monkeypatch: pytest.MonkeyPatch): - """SSH connection to a MagicMock instance for health check.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_ssh_connection.host = "test host IP" - mock_result = MagicMock(spec=Result) - mock_result.ok = True - mock_result.stderr = "" - mock_result.stdout = "-- Test output: /bin/bash /home/ubuntu/actions-runner/run.sh --" - mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = mock_ssh_connection - - return mock_get_ssh_connection - - 
-@pytest.fixture(name="patch_ssh_connection_error") -def patch_ssh_connection_error_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch SSH connection to a MagicMock instance with error on run.""" - mock_get_ssh_connection = MagicMock( - spec=openstack_manager.OpenstackRunnerManager._get_ssh_connection - ) - mock_ssh_connection = MagicMock(spec=SSHConnection) - mock_result = MagicMock(spec=Result) - mock_result.ok = False - mock_result.stdout = "Mock stdout" - mock_result.stderr = "Mock stderr" - mock_ssh_connection.run.return_value = mock_result - mock_get_ssh_connection.return_value = mock_ssh_connection - - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - mock_get_ssh_connection, - ) - - -@pytest.fixture(name="mock_github_client") -def mock_github_client_fixture() -> MagicMock: - """Mocked github client that returns runner application.""" - mock_github_client = MagicMock(spec=openstack_manager.GithubClient) - mock_github_client.get_runner_application.return_value = RunnerApplication( - os="linux", - architecture="x64", - download_url="http://test_url", - filename="test_filename", - temp_download_token="test_token", - ) - mock_github_client.get_runner_registration_token.return_value = "test_token" - return mock_github_client - - -@pytest.fixture(name="patch_execute_command") -def patch_execute_command_fixture(monkeypatch: pytest.MonkeyPatch): - """Patch execute command to a MagicMock instance.""" - monkeypatch.setattr( - openstack_manager, - "execute_command", - MagicMock(spec=openstack_manager.execute_command), - ) - - -@pytest.fixture(name="patched_create_connection_context") -def patched_create_connection_context_fixture(monkeypatch: pytest.MonkeyPatch): - """Return a mocked openstack connection context manager and patch create_connection.""" - mock_connection = MagicMock(spec=openstack_manager.openstack.connection.Connection) - monkeypatch.setattr( - openstack_manager, - "_create_connection", - 
MagicMock(spec=openstack_manager._create_connection, return_value=mock_connection), - ) - return mock_connection.__enter__() - - -@pytest.fixture(name="ssh_connection_mock") -def ssh_connection_mock_fixture() -> MagicMock: - """Return a mocked ssh connection.""" - test_file_content = secrets.token_hex(16) - ssh_conn_mock = MagicMock(spec=openstack_manager.SSHConnection) - ssh_conn_mock.get.side_effect = lambda remote, local: Path(local).write_text(test_file_content) - ssh_conn_mock.run.side_effect = lambda cmd, **kwargs: ( - Result(stdout="1") if cmd.startswith("stat") else Result() - ) - ssh_conn_mock.run.return_value = Result() - - return ssh_conn_mock - - -@pytest.fixture(name="openstack_manager_for_reconcile") -def openstack_manager_for_reconcile_fixture( - monkeypatch: pytest.MonkeyPatch, - mock_github_client: MagicMock, - patched_create_connection_context: MagicMock, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """Create a mocked openstack manager for the reconcile tests.""" - t_mock = MagicMock(return_value=12345) - monkeypatch.setattr(openstack_manager.time, "time", t_mock) - - issue_event_mock = MagicMock(spec=metric_events.issue_event) - monkeypatch.setattr(openstack_manager.metric_events, "issue_event", issue_event_mock) - - runner_metrics_mock = MagicMock(openstack_manager.runner_metrics) - monkeypatch.setattr(openstack_manager, "runner_metrics", runner_metrics_mock) - - github_metrics_mock = MagicMock(openstack_manager.github_metrics) - monkeypatch.setattr(openstack_manager, "github_metrics", github_metrics_mock) - - monkeypatch.setattr( - openstack_manager, "GithubClient", MagicMock(return_value=mock_github_client) - ) - - runner_metrics_path = tmp_path / "runner_fs" - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - - pool_mock 
= MagicMock() - pool_mock.__enter__.return_value = pool_mock - pool_mock.map.side_effect = lambda func, iterable: func(*iterable) - pool_cls_mock = MagicMock() - pool_cls_mock.return_value = pool_mock - monkeypatch.setattr(openstack_manager, "Pool", pool_cls_mock) - - app_name = secrets.token_hex(16) - charm_state = MagicMock(spec=CharmState) - charm_state.proxy_config = ProxyConfig() - charm_state.ssh_debug_connections = MagicMock() - charm_state.charm_config = MagicMock() - charm_state.charm_config.repo_policy_compliance = None - os_runner_manager_config = openstack_manager.OpenstackRunnerManagerConfig( - charm_state=charm_state, - path=MagicMock(), - labels=[], - token=secrets.token_hex(16), - flavor=app_name, - image="test-image-id", - network=secrets.token_hex(16), - dockerhub_mirror=None, - ) - patched_create_connection_context.create_keypair.return_value = Keypair(private_key="test_key") - server_mock = MagicMock() - server_mock.status = openstack_manager._INSTANCE_STATUS_ACTIVE - patched_create_connection_context.get_server.return_value = server_mock - - os_runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=app_name, - unit_num=0, - openstack_runner_manager_config=os_runner_manager_config, - cloud_config={}, - ) - os_runner_manager._ssh_health_check = MagicMock(return_value=True) - os_runner_manager._get_ssh_connection = MagicMock(return_value=ssh_connection_mock) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, "_wait_until_runner_process_running", MagicMock() - ) - - monkeypatch.setattr(openstack_manager, "_SSH_KEY_PATH", tmp_path) - monkeypatch.setattr(openstack_manager.shutil, "chown", MagicMock()) - - return os_runner_manager - - -@pytest.fixture(name="reactive_reconcile_mock") -def reactive_reconcile_fixture(monkeypatch: MonkeyPatch, tmp_path: Path) -> MagicMock: - """Mock the job class.""" - reconcile_mock = MagicMock(spec=reactive.runner_manager.reconcile) - monkeypatch.setattr( - 
"openstack_cloud.openstack_manager.reactive_runner_manager.reconcile", reconcile_mock - ) - reconcile_mock.side_effect = lambda quantity, **kwargs: quantity - return reconcile_mock - - -def test__create_connection_error(clouds_yaml: dict, openstack_connect_mock: MagicMock): - """ - arrange: given a monkeypatched connection.authorize() function that raises an error. - act: when _create_connection is called. - assert: OpenStackUnauthorizedError is raised. - """ - connection_mock = MagicMock() - connection_context = MagicMock() - connection_context.authorize.side_effect = openstack.exceptions.HttpException - connection_mock.__enter__.return_value = connection_context - openstack_connect_mock.return_value = connection_mock - - with pytest.raises(OpenStackError) as exc: - with openstack_manager._create_connection(cloud_config=clouds_yaml): - pass - - assert "Failed OpenStack API call" in str(exc) - - -def test__create_connection( - multi_clouds_yaml: dict, clouds_yaml: dict, cloud_name: str, openstack_connect_mock: MagicMock -): - """ - arrange: given a cloud config yaml dict with 1. multiple clouds 2. single cloud. - act: when _create_connection is called. - assert: connection with first cloud in the config is used. - """ - # 1. multiple clouds - with openstack_manager._create_connection(cloud_config=multi_clouds_yaml): - openstack_connect_mock.assert_called_with(cloud=CLOUD_NAME) - - # 2. 
single cloud - with openstack_manager._create_connection(cloud_config=clouds_yaml): - openstack_connect_mock.assert_called_with(cloud=cloud_name) - - -@pytest.mark.parametrize( - "dockerhub_mirror, ssh_debug_connections, expected_env_contents", - [ - pytest.param( - None, - None, - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh -""", - id="all values empty", - ), - pytest.param( - "http://dockerhub_mirror.test", - None, - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -DOCKERHUB_MIRROR=http://dockerhub_mirror.test -CONTAINER_REGISTRY_URL=http://dockerhub_mirror.test - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh -""", - id="dockerhub mirror set", - ), - pytest.param( - None, - [ - openstack_manager.SSHDebugConnection( - host="127.0.0.1", - port=10022, - rsa_fingerprint="SHA256:testrsa", - ed25519_fingerprint="SHA256:tested25519", - ) - ], - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh - -TMATE_SERVER_HOST=127.0.0.1 -TMATE_SERVER_PORT=10022 -TMATE_SERVER_RSA_FINGERPRINT=SHA256:testrsa -TMATE_SERVER_ED25519_FINGERPRINT=SHA256:tested25519 -""", - id="ssh debug connection set", - ), - pytest.param( - "http://dockerhub_mirror.test", - [ - openstack_manager.SSHDebugConnection( - host="127.0.0.1", - port=10022, - rsa_fingerprint="SHA256:testrsa", - ed25519_fingerprint="SHA256:tested25519", - ) - ], - """PATH=/home/ubuntu/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin - -DOCKERHUB_MIRROR=http://dockerhub_mirror.test -CONTAINER_REGISTRY_URL=http://dockerhub_mirror.test - -LANG=C.UTF-8 -ACTIONS_RUNNER_HOOK_JOB_STARTED=/home/ubuntu/actions-runner/pre-job.sh - 
-TMATE_SERVER_HOST=127.0.0.1 -TMATE_SERVER_PORT=10022 -TMATE_SERVER_RSA_FINGERPRINT=SHA256:testrsa -TMATE_SERVER_ED25519_FINGERPRINT=SHA256:tested25519 -""", - id="all values set", - ), - ], -) -def test__generate_runner_env( - dockerhub_mirror: Optional[str], - ssh_debug_connections: Optional[list[openstack_manager.SSHDebugConnection]], - expected_env_contents: str, -): - """ - arrange: given configuration values for runner environment. - act: when _generate_runner_env is called. - assert: expected .env contents are generated. - """ - environment = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"), autoescape=True) - assert ( - openstack_manager._generate_runner_env( - templates_env=environment, - dockerhub_mirror=dockerhub_mirror, - ssh_debug_connections=ssh_debug_connections, - ) - == expected_env_contents - ) - - -def test_reconcile_issues_runner_installed_event( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, -): - """ - arrange: Mock openstack manager for reconcile. - act: Reconcile to create a runner. - assert: The expected event is issued. - """ - openstack_manager_for_reconcile.reconcile(quantity=1) - - openstack_manager.metric_events.issue_event.assert_has_calls( - [ - call( - event=metric_events.RunnerInstalled( - timestamp=openstack_manager.time.time(), - flavor=openstack_manager_for_reconcile.app_name, - duration=0, - ) - ) - ] - ) - - -def test_reconcile_places_timestamp_in_metrics_storage( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Mock timestamps and create the directory for the metrics storage. - act: Reconcile to create a runner. - assert: The expected timestamp is placed in the shared filesystem. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - assert (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - assert (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).read_text() == str( - openstack_manager.time.time() - ) - - -def test_reconcile_error_on_placing_timestamp_is_ignored( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Do not create the directory for the metrics storage\ - in order to let a FileNotFoundError to be raised inside the OpenstackRunnerManager. - act: Reconcile to create a runner. - assert: No exception is raised. - """ - runner_metrics_path = tmp_path / "runner_fs" - - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - assert not (ms.path / RUNNER_INSTALLED_TS_FILE_NAME).exists() - - -def test_reconcile_pulls_metric_files( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """ - arrange: Mock the metrics storage and the ssh connection. - act: Reconcile to create a runner. - assert: The expected metric files are pulled from the shared filesystem. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=(), unhealthy=("test_runner",)) - ) - ssh_connection_mock.get.side_effect = MagicMock() - openstack_manager_for_reconcile.reconcile(quantity=0) - - ssh_connection_mock.get.assert_any_call( - remote=str(METRICS_EXCHANGE_PATH / "pre-job-metrics.json"), - local=str(ms.path / "pre-job-metrics.json"), - ) - ssh_connection_mock.get.assert_any_call( - remote=str(METRICS_EXCHANGE_PATH / "post-job-metrics.json"), - local=str(ms.path / "post-job-metrics.json"), - ) - - -def test_reconcile_does_not_pull_too_large_files( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - ssh_connection_mock: MagicMock, -): - """ - arrange: Mock the metrics storage and the ssh connection to return a file that is too large. - act: Reconcile to create a runner. - assert: The expected metric files are not pulled from the shared filesystem. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - ssh_connection_mock.run.side_effect = lambda cmd, **kwargs: ( - Result(stdout=f"{MAX_METRICS_FILE_SIZE + 1}") if cmd.startswith("stat") else Result() - ) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=("test_runner",), unhealthy=()) - ) - - openstack_manager_for_reconcile.reconcile(quantity=0) - - assert not (ms.path / "pre-job-metrics.json").exists() - assert not (ms.path / "post-job-metrics.json").exists() - - -def test_reconcile_issue_reconciliation_metrics( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, -): - """ - arrange: Mock the metrics storage and the ssh connection. - act: Reconcile. - assert: The expected reconciliation metrics are issued. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth(healthy=("test_runner",), unhealthy=()) - ) - - openstack_manager.runner_metrics.extract.return_value = (MagicMock() for _ in range(2)) - openstack_manager.runner_metrics.issue_events.side_effect = [ - {metric_events.RunnerStart, metric_events.RunnerStop}, - {metric_events.RunnerStart}, - ] - - openstack_manager_for_reconcile._github.get_runner_github_info.return_value = [ - SelfHostedRunner( - busy=False, - id=1, - labels=[], - os="linux", - name=f"{openstack_manager_for_reconcile.instance_name}-test_runner", - status=GitHubRunnerStatus.ONLINE, - ) - ] - openstack_manager_for_reconcile.reconcile(quantity=0) - - openstack_manager.metric_events.issue_event.assert_has_calls( - [ - call( - event=metric_events.Reconciliation( - timestamp=12345, - flavor=openstack_manager_for_reconcile.app_name, - crashed_runners=1, - idle_runners=1, - duration=0, - ) - ) - ] - ) - - -def test_reconcile_ignores_metrics_for_openstack_online_runners( - openstack_manager_for_reconcile, - monkeypatch, - tmp_path, - patched_create_connection_context: MagicMock, -): - """ - arrange: Combination of runner status/github status and openstack status. - act: Call reconcile. - assert: All runners which have an instance on Openstack are ignored for metrics extraction. 
- """ - runner_metrics_path = tmp_path / "runner_fs" - runner_metrics_path.mkdir() - ms = MetricsStorage(path=runner_metrics_path, runner_name="test_runner") - monkeypatch.setattr(openstack_manager.metrics_storage, "create", MagicMock(return_value=ms)) - monkeypatch.setattr(openstack_manager.metrics_storage, "get", MagicMock(return_value=ms)) - instance_name = openstack_manager_for_reconcile.instance_name - runner_names = { - k: f"{instance_name}-{k}" - for k in [ - "healthy_online", - "healthy_offline", - "unhealthy_online", - "unhealthy_offline", - "openstack_online_no_github_status", - "github_online_no_openstack_status", - ] - } - openstack_manager_for_reconcile._get_openstack_runner_status = MagicMock( - return_value=RunnerNameByHealth( - healthy=(runner_names["healthy_online"], runner_names["healthy_offline"]), - unhealthy=( - runner_names["unhealthy_online"], - runner_names["unhealthy_offline"], - runner_names["github_online_no_openstack_status"], - ), - ) - ) - openstack_manager_for_reconcile.get_github_runner_info = MagicMock( - return_value=( - RunnerGithubInfo( - runner_name=runner_names["healthy_online"], runner_id=0, online=True, busy=True - ), - RunnerGithubInfo( - runner_name=runner_names["unhealthy_online"], runner_id=1, online=True, busy=False - ), - RunnerGithubInfo( - runner_name=runner_names["healthy_offline"], runner_id=2, online=False, busy=False - ), - RunnerGithubInfo( - runner_name=runner_names["unhealthy_offline"], - runner_id=3, - online=False, - busy=False, - ), - RunnerGithubInfo( - runner_name=runner_names["github_online_no_openstack_status"], - runner_id=4, - online=True, - busy=False, - ), - ) - ) - - openstack_online_runner_names = [ - runner - for (name, runner) in runner_names.items() - if name != "github_online_no_openstack_status" - ] - openstack_instances = [ - openstack_manager.openstack.compute.v2.server.Server( - name=runner_name, status=random.choice(("ACTIVE", "BUILD", "STOPPED")) - ) - for runner_name in 
openstack_online_runner_names - ] - patched_create_connection_context.list_servers.return_value = openstack_instances - - openstack_manager.runner_metrics.extract.return_value = (MagicMock() for _ in range(1)) - openstack_manager.runner_metrics.issue_events.side_effect = [ - {metric_events.RunnerStart, metric_events.RunnerStop}, - ] - - openstack_manager_for_reconcile.reconcile(quantity=0) - - openstack_manager.runner_metrics.extract.assert_called_once_with( - metrics_storage_manager=metrics.storage, - runners=set(openstack_online_runner_names), - ) - - -def test_reconcile_reactive_mode( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - reactive_reconcile_mock: MagicMock, - caplog: LogCaptureFixture, -): - """ - arrange: Enable reactive mode and mock the job class to return a job. - act: Call reconcile with a random quantity n. - assert: The mocked job is picked up n times and the expected log message is present. - """ - count = random.randint(0, 5) - openstack_manager_for_reconcile._config.reactive_config = ReactiveConfig( - mq_uri=FAKE_MONGODB_URI - ) - actual_count = openstack_manager_for_reconcile.reconcile(quantity=count) - - assert actual_count == count - reactive_reconcile_mock.assert_called_with( - quantity=count, - mq_uri=FAKE_MONGODB_URI, - queue_name=openstack_manager_for_reconcile.app_name, - ) - - -def test_repo_policy_config( - openstack_manager_for_reconcile: openstack_manager.OpenstackRunnerManager, - monkeypatch: pytest.MonkeyPatch, - patched_create_connection_context: MagicMock, -): - """ - arrange: Mock the repo policy compliance config. - act: Reconcile to create a runner. - assert: The expected url and one-time-token is present in the pre-job script in \ - the cloud-init data. 
- """ - test_url = "http://test.url" - token = secrets.token_hex(16) - one_time_token = secrets.token_hex(16) - openstack_manager_for_reconcile._config.charm_state.charm_config.repo_policy_compliance = ( - RepoPolicyComplianceConfig(url=test_url, token=token) - ) - repo_policy_compliance_client_mock = MagicMock( - spec=openstack_manager.RepoPolicyComplianceClient - ) - repo_policy_compliance_client_mock.base_url = test_url - repo_policy_compliance_client_mock.get_one_time_token.return_value = one_time_token - repo_policy_compliance_cls_mock = MagicMock(return_value=repo_policy_compliance_client_mock) - monkeypatch.setattr( - openstack_manager, "RepoPolicyComplianceClient", repo_policy_compliance_cls_mock - ) - - openstack_manager_for_reconcile.reconcile(quantity=1) - - cloud_init_data_str = patched_create_connection_context.create_server.call_args[1]["userdata"] - repo_policy_compliance_client_mock.get_one_time_token.assert_called_once() - assert one_time_token in cloud_init_data_str - assert test_url in cloud_init_data_str - - -def test__ensure_security_group_with_existing_rules(): - """ - arrange: Mock OpenStack connection with the security rules created. - act: Run `_ensure_security_group`. - assert: The security rules are not created again. - """ - connection_mock = MagicMock(spec=openstack.connection.Connection) - connection_mock.list_security_groups.return_value = [ - { - "security_group_rules": [ - {"protocol": "icmp"}, - {"protocol": "tcp", "port_range_min": 22, "port_range_max": 22}, - {"protocol": "tcp", "port_range_min": 10022, "port_range_max": 10022}, - ], - "id": "TEST_ID", - } - ] - - openstack_manager.OpenstackRunnerManager._ensure_security_group(connection_mock) - connection_mock.create_security_group_rule.assert_not_called() - - -def test__get_ssh_connection( - monkeypatch, - patch_get_ssh_connection_health_check, - mock_server: MagicMock, -): - """ - arrange: A server with SSH setup correctly. - act: Get the SSH connections. 
- assert: The SSH connections contains at least one connection. - """ - # Patching the `_get_key_path` to get around the keyfile checks. - mock__get_key_path = MagicMock(spec=openstack_manager.OpenstackRunnerManager._get_key_path) - mock_key_path = MagicMock(spec=Path) - mock_key_path.exists.return_value = True - mock__get_key_path.return_value = mock_key_path - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, "_get_key_path", mock__get_key_path - ) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - conn = openstack_manager.OpenstackRunnerManager._get_ssh_connection( - mock_connection, mock_server.name - ) - assert conn is not None - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_success(monkeypatch: pytest.MonkeyPatch, mock_server: MagicMock): - """ - arrange: A server with SSH correctly setup. - act: Run health check on the server. - assert: The health check passes. - """ - ssh_connection_mock = MagicMock() - result_mock = MagicMock() - result_mock.stdout = "/home/ubuntu/actions-runner/run.sh\nRunner.Worker" - ssh_connection_mock.run.return_value = result_mock - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=ssh_connection_mock), - ) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - assert openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_no_key(mock_server: MagicMock): - """ - arrange: A server with no key available. - act: Run health check on the server. - assert: The health check fails. - """ - # Remove the mock SSH key. 
- mock_server.key_name = None - - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - assert "no valid keypair found" in str(exc) - - -@pytest.mark.usefixtures("skip_retry") -def test__ssh_health_check_error(monkeypatch: pytest.MonkeyPatch, mock_server: MagicMock): - """ - arrange: A server with error on SSH run. - act: Run health check on the server. - assert: The health check fails. - """ - monkeypatch.setattr(openstack_manager.OpenstackRunnerManager, "_get_key_path", MagicMock()) - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(side_effect=TimeoutError) - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._ssh_health_check( - mock_connection, mock_server.name, False - ) - - assert "No connectable SSH addresses found" in str(exc) - - -def test__wait_until_runner_process_running_no_server(): - """ - arrange: No server existing on the OpenStack connection. - act: Check if runner process is running. - assert: RunnerStartError thrown. 
- """ - mock_connection = MagicMock(spec=openstack.connection.Connection) - mock_connection.get_server.return_value = None - - with pytest.raises(RunnerStartError): - openstack_manager.OpenstackRunnerManager._wait_until_runner_process_running( - mock_connection, "Non-existing-server" - ) - - -@pytest.mark.parametrize( - "server", - [ - pytest.param(None, id="no server"), - pytest.param(factories.MockOpenstackServer(status="SHUTOFF"), id="shutoff"), - pytest.param(factories.MockOpenstackServer(status="REBUILD"), id="not active/building"), - ], -) -def test__health_check(server: factories.MockOpenstackServer | None): - """ - arrange: given a mock openstack.get_server response. - act: when _health_check is called. - assert: False is returned, meaning unhealthy runner. - """ - mock_get_server = MagicMock(return_value=server) - mock_connection = MagicMock() - mock_connection.get_server = mock_get_server - - assert not openstack_manager.OpenstackRunnerManager._health_check( - conn=mock_connection, server_name="test" - ) - - -# The SSH health check will temporarily return True on failure for debugging purposes. -@pytest.mark.xfail -def test__ssh_health_check_connection_error(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a monkeypatched _get_ssh_connection function that raises _SSHError. - act: when _ssh_health_check is called. - assert: False is returned, meaning unhealthy runner. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(side_effect=openstack_manager._SSHError), - ) - - assert not openstack_manager.OpenstackRunnerManager._ssh_health_check( - server=MagicMock(), startup=False - ) - - -@pytest.mark.parametrize( - "result", - [ - pytest.param(factories.MockSSHRunResult(exited=1), id="ssh result not ok"), - pytest.param( - factories.MockSSHRunResult(exited=0, stdout=""), - id="runner process not found in stdout", - ), - # This health check should fail but temporarily marking as passing for passive runner - # deletion until we have more data. - pytest.param( - factories.MockSSHRunResult(exited=0, stdout="/home/ubuntu/actions-runner/run.sh"), - id="startup process exists but no listener or worker process", - ), - ], -) -@pytest.mark.xfail -def test__ssh_health_check_unhealthy( - monkeypatch: pytest.MonkeyPatch, result: factories.MockSSHRunResult -): - """ - arrange: given unhealthy ssh responses. - act: when _ssh_health_check is called. - assert: False is returned, meaning unhealthy runner. 
- """ - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(return_value=result) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=mock_ssh_connection), - ) - - assert not openstack_manager.OpenstackRunnerManager._ssh_health_check( - server=MagicMock(), startup=False - ) - - -@pytest.mark.parametrize( - "result, startup", - [ - pytest.param( - factories.MockSSHRunResult( - exited=0, stdout="/home/ubuntu/actions-runner/run.sh\nRunner.Worker" - ), - False, - id="runner process & workper process found", - ), - pytest.param( - factories.MockSSHRunResult( - exited=0, stdout="/home/ubuntu/actions-runner/run.sh\nRunner.Listener" - ), - False, - id="runner process & listener process found", - ), - pytest.param( - factories.MockSSHRunResult(exited=0, stdout="/home/ubuntu/actions-runner/run.sh"), - True, - id="runner process found for startup", - ), - ], -) -def test__ssh_health_check_healthy( - monkeypatch: pytest.MonkeyPatch, result: factories.MockSSHRunResult, startup: bool -): - """ - arrange: given healthy ssh response. - act: when _ssh_health_check is called. - assert: True is returned, meaning healthy runner. - """ - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock(return_value=result) - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_ssh_connection", - MagicMock(return_value=mock_ssh_connection), - ) - - assert openstack_manager.OpenstackRunnerManager._ssh_health_check( - conn=MagicMock(), server_name=MagicMock(), startup=startup - ) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server_gone(): - """ - arrange: given a mock Openstack get_server function that returns None. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - mock_connection = MagicMock() - mock_connection.get_server.return_value = None - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Server gone while trying to get SSH connection" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_no_server_key(): - """ - arrange: given a mock server instance with no key attached. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. - """ - mock_server = MagicMock() - mock_server.key_name = None - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Unable to create SSH connection, no valid keypair found" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_key_not_exists(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a monkeypatched _get_key_path function that returns a non-existent path. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. - """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path("does-not-exist")), - ) - mock_connection = MagicMock() - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "Missing keyfile for server" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server_no_addresses(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given a mock server instance with no server addresses. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "No addresses found for OpenStack server" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -@pytest.mark.parametrize( - "run", - [ - pytest.param(MagicMock(side_effect=TimeoutError), id="timeout error"), - pytest.param( - MagicMock(return_value=factories.MockSSHRunResult(exited=1)), id="result not ok" - ), - pytest.param( - MagicMock(return_value=factories.MockSSHRunResult(exited=0, stdout="")), - id="empty response", - ), - ], -) -def test__get_ssh_connection_server_no_valid_connections( - monkeypatch: pytest.MonkeyPatch, run: MagicMock -): - """ - arrange: given a monkeypatched Connection instance that returns invalid connections. - act: when _get_ssh_connection is called. - assert: _SSHError is raised. 
- """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {"test": [{"addr": "test-address"}]} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = run - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - with pytest.raises(openstack_manager._SSHError) as exc: - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - - assert "No connectable SSH addresses found" in str(exc.getrepr()) - - -@pytest.mark.usefixtures("skip_retry") -def test__get_ssh_connection_server(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given monkeypatched SSH connection instance. - act: when _get_ssh_connection is called. - assert: the SSH connection instance is returned. - """ - monkeypatch.setattr( - openstack_manager.OpenstackRunnerManager, - "_get_key_path", - MagicMock(return_value=Path(".")), - ) - mock_server = MagicMock() - mock_server.addresses = {"test": [{"addr": "test-address"}]} - mock_connection = MagicMock() - mock_connection.get_server.return_value = mock_server - mock_ssh_connection = MagicMock() - mock_ssh_connection.run = MagicMock( - return_value=factories.MockSSHRunResult(exited=0, stdout="hello world") - ) - monkeypatch.setattr( - openstack_manager, "SSHConnection", MagicMock(return_value=mock_ssh_connection) - ) - - assert ( - openstack_manager.OpenstackRunnerManager._get_ssh_connection( - conn=mock_connection, server_name="test" - ) - == mock_ssh_connection - ) - - -def test_flush(monkeypatch: pytest.MonkeyPatch): - """ - arrange: given monkeypatched sub functions of flush. - act: when flush is called. - assert: sub functions are called. 
- """ - monkeypatch.setattr(openstack_manager, "_create_connection", MagicMock()) - monkeypatch.setattr(openstack_manager, "set_env_var", MagicMock()) - runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=MagicMock(), - unit_num=MagicMock(), - openstack_runner_manager_config=MagicMock(), - cloud_config=MagicMock(), - ) - runner_manager._kill_runner_processes = MagicMock() - runner_manager._get_openstack_runner_status = MagicMock() - runner_manager._github = MagicMock() - runner_manager._remove_runners = MagicMock() - - runner_manager.flush(mode=MagicMock()) - - runner_manager._kill_runner_processes.assert_called() - runner_manager._get_openstack_runner_status.assert_called() - runner_manager._github.get_runner_remove_token.assert_called() - runner_manager._remove_runners.assert_called() - - -@pytest.mark.parametrize( - "flush_mode, expected_command", - [ - pytest.param( - FlushMode.FLUSH_BUSY, - "pgrep -x Runner.Listener && kill $(pgrep -x Runner.Listener);" - "pgrep -x Runner.Worker && kill $(pgrep -x Runner.Worker);", - id="Flush Busy", - ), - pytest.param( - FlushMode.FLUSH_IDLE, - "! pgrep -x Runner.Worker && pgrep -x Runner.Listener && " - "kill $(pgrep -x Runner.Listener)", - id="Flush Idle", - ), - ], -) -def test__kill_runner_processes( - monkeypatch: pytest.MonkeyPatch, flush_mode: FlushMode, expected_command: str -): - """ - arrange: given supported flush modes. - act: when _kill_runner_processes is called. - assert: expected kill commands are issued. 
- """ - monkeypatch.setattr(openstack_manager, "_create_connection", MagicMock()) - monkeypatch.setattr(openstack_manager, "set_env_var", MagicMock()) - runner_manager = openstack_manager.OpenstackRunnerManager( - app_name=MagicMock(), - unit_num=MagicMock(), - openstack_runner_manager_config=MagicMock(), - cloud_config=MagicMock(), - ) - runner_manager._get_openstack_instances = MagicMock(return_value=[MagicMock(), MagicMock()]) - mock_connection = MagicMock() - runner_manager._get_ssh_connection = MagicMock(return_value=mock_connection) - - runner_manager._kill_runner_processes(conn=MagicMock(), mode=flush_mode) - - mock_connection.run.assert_called_with(expected_command, warn=True) diff --git a/tests/unit/test_runner.py b/tests/unit/test_runner.py index fdf8fc2a1..e6d57f305 100644 --- a/tests/unit/test_runner.py +++ b/tests/unit/test_runner.py @@ -8,12 +8,14 @@ from pathlib import Path from unittest.mock import MagicMock, call +import github_runner_manager.metrics.runner_logs import jinja2 import pytest from _pytest.monkeypatch import MonkeyPatch +from github_runner_manager.metrics.storage import MetricsStorage +from github_runner_manager.types_.github import GitHubOrg, GitHubRepo -import metrics.runner_logs -from charm_state import GithubOrg, GithubRepo, SSHDebugConnection, VirtualMachineResources +from charm_state import SSHDebugConnection, VirtualMachineResources from errors import ( CreateMetricsStorageError, LxdError, @@ -22,7 +24,6 @@ RunnerRemoveError, ) from lxd import LxdInstance, LxdInstanceFileManager -from metrics.storage import MetricsStorage from runner import DIAG_DIR_PATH, CreateRunnerConfig, Runner, RunnerConfig, RunnerStatus from runner_manager_type import RunnerManagerClients from runner_type import ProxySetting @@ -102,7 +103,9 @@ def create_logs_dir(runner_name: str) -> Path: return target_log_path - create_logs_dir_mock = MagicMock(spec=metrics.runner_logs.create_logs_dir) + create_logs_dir_mock = MagicMock( + 
spec=github_runner_manager.metrics.runner_logs.create_logs_dir + ) create_logs_dir_mock.side_effect = create_logs_dir monkeypatch.setattr("runner.create_logs_dir", create_logs_dir_mock) @@ -138,11 +141,11 @@ def ssh_debug_connections_fixture() -> list[SSHDebugConnection]: name="runner", params=[ ( - GithubOrg("test_org", "test_group"), + GitHubOrg("test_org", "test_group"), ProxySetting(no_proxy=None, http=None, https=None, aproxy_address=None), ), ( - GithubRepo("test_owner", "test_repo"), + GitHubRepo("test_owner", "test_repo"), ProxySetting( no_proxy="test_no_proxy", http=TEST_PROXY_SERVER_URL, @@ -522,7 +525,7 @@ def test_pull_logs(runner: Runner, log_dir_base_path: Path): runner.instance.files.pull_file.assert_has_calls( [ call(str(DIAG_DIR_PATH), str(log_dir_path), is_dir=True), - call(str(metrics.runner_logs.SYSLOG_PATH), str(log_dir_path)), + call(str(github_runner_manager.metrics.runner_logs.SYSLOG_PATH), str(log_dir_path)), ] ) diff --git a/tests/unit/test_runner_scaler.py b/tests/unit/test_runner_scaler.py new file mode 100644 index 000000000..f3199fd99 --- /dev/null +++ b/tests/unit/test_runner_scaler.py @@ -0,0 +1,275 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ + +from typing import Iterable +from unittest.mock import MagicMock + +import pytest +from github_runner_manager.manager.cloud_runner_manager import CloudRunnerState, InstanceId +from github_runner_manager.manager.github_runner_manager import GitHubRunnerState +from github_runner_manager.manager.runner_manager import ( + FlushMode, + RunnerManager, + RunnerManagerConfig, +) +from github_runner_manager.manager.runner_scaler import RunnerScaler +from github_runner_manager.types_.github import GitHubPath, GitHubRepo + +from tests.unit.mock_runner_managers import ( + MockCloudRunnerManager, + MockGitHubRunnerManager, + SharedMockRunnerManagerState, +) + + +def mock_runner_manager_spawn_runners( + create_runner_args: Iterable[RunnerManager._CreateRunnerArgs], +) -> tuple[InstanceId, ...]: + """Mock _spawn_runners method of RunnerManager. + + The _spawn_runners method uses multi-process, which copies the object, e.g., the mocks. + There is no easy way to sync the state of the mock objects across processes. Replacing + _spawn_runners to remove the multi-process.pool is an easier approach. + + Args: + create_runner_args: The arguments for the create_runner method. + + Returns: + The instance ids of the runners spawned. 
+ """ + return tuple(RunnerManager._create_runner(arg) for arg in create_runner_args) + + +@pytest.fixture(scope="function", name="github_path") +def github_path_fixture() -> GitHubPath: + return GitHubRepo("mock_owner", "mock_repo") + + +@pytest.fixture(scope="function", name="mock_runner_managers") +def mock_runner_managers_fixture( + github_path: GitHubPath, +) -> tuple[MockCloudRunnerManager, MockGitHubRunnerManager]: + state = SharedMockRunnerManagerState() + mock_cloud = MockCloudRunnerManager(state) + mock_github = MockGitHubRunnerManager(mock_cloud.name_prefix, github_path, state) + return (mock_cloud, mock_github) + + +@pytest.fixture(scope="function", name="runner_manager") +def runner_manager_fixture( + monkeypatch, mock_runner_managers, github_path: GitHubPath +) -> RunnerManager: + mock_cloud, mock_github = mock_runner_managers + monkeypatch.setattr( + "github_runner_manager.manager.runner_manager.RunnerManager._spawn_runners", + mock_runner_manager_spawn_runners, + ) + # Patch out the metrics, as metrics has their own tests. 
+ monkeypatch.setattr( + "github_runner_manager.manager.runner_manager.github_metrics.job", MagicMock() + ) + monkeypatch.setattr( + "github_runner_manager.manager.runner_manager.runner_metrics.issue_events", MagicMock() + ) + + config = RunnerManagerConfig("mock_token", github_path) + runner_manager = RunnerManager("mock_runners", mock_cloud, config) + runner_manager._github = mock_github + return runner_manager + + +@pytest.fixture(scope="function", name="runner_scaler") +def runner_scaler_fixture(runner_manager: RunnerManager) -> RunnerScaler: + return RunnerScaler(runner_manager, None) + + +@pytest.fixture(scope="function", name="runner_scaler_one_runner") +def runner_scaler_one_runner_fixture(runner_scaler: RunnerScaler) -> RunnerScaler: + runner_scaler.reconcile(1) + assert_runner_info(runner_scaler, online=1) + return runner_scaler + + +def set_one_runner_state( + runner_scaler: RunnerScaler, + github_state: GitHubRunnerState | None = None, + cloud_state: CloudRunnerState | None = None, +): + """Set the runner state for a RunnerScaler with one runner. + + Args: + runner_scaler: The RunnerScaler instance to modify. + github_state: The github state to set the runner. + cloud_state: The cloud state to set the runner. + """ + runner_dict = runner_scaler._manager._github.state.runners + assert len(runner_dict) == 1, "Test arrange failed: One runner should be present" + instance_id = list(runner_dict.keys())[0] + if github_state is not None: + runner_dict[instance_id].github_state = github_state + if cloud_state is not None: + runner_dict[instance_id].cloud_state = cloud_state + + +def assert_runner_info( + runner_scaler: RunnerScaler, online: int = 0, busy: int = 0, offline: int = 0, unknown: int = 0 +) -> None: + """Assert runner info contains a certain amount of runners. + + Args: + runner_scaler: The RunnerScaler to get information from. + online: The number of online runners to assert for. + busy: The number of busy runners to assert for. 
+ offline: The number of offline runners to assert for. + unknown: The number of unknown runners to assert for. + """ + info = runner_scaler.get_runner_info() + assert info.offline == offline + assert info.online == online + assert info.busy == busy + assert info.unknown == unknown + assert isinstance(info.runners, tuple) + assert len(info.runners) == online + assert isinstance(info.busy_runners, tuple) + assert len(info.busy_runners) == busy + + +def test_get_no_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: Get runner information. + Assert: Information should contain no runners. + """ + assert_runner_info(runner_scaler, online=0) + + +def test_flush_no_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: + 1. Flush idle runners. + 2. Flush busy runners. + Assert: + 1. No change in number of runners. Runner info should contain no runners. + 2. No change in number of runners. + """ + # 1. + diff = runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + # 2. + diff = runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + +def test_reconcile_runner_create_one(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: Reconcile to no runners. + Assert: No changes. Runner info should contain no runners. + """ + diff = runner_scaler.reconcile(quantity=0) + assert diff == 0 + assert_runner_info(runner_scaler, online=0) + + +def test_one_runner(runner_scaler: RunnerScaler): + """ + Arrange: A RunnerScaler with no runners. + Act: + 1. Reconcile to one runner. + 2. Reconcile to one runner. + 3. Flush idle runners. + 4. Reconcile to one runner. + Assert: + 1. Runner info has one runner. + 2. No changes to number of runner. + 3. No runners. + 4. Runner info has one runner. + """ + # 1. 
+ diff = runner_scaler.reconcile(1) + assert diff == 1 + assert_runner_info(runner_scaler, online=1) + + # 2. + diff = runner_scaler.reconcile(1) + assert diff == 0 + assert_runner_info(runner_scaler, online=1) + + # 3. + runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE) + assert_runner_info(runner_scaler, online=0) + + # 4. + diff = runner_scaler.reconcile(1) + assert diff == 1 + assert_runner_info(runner_scaler, online=1) + + +def test_flush_busy_on_idle_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one idle runner. + Act: Run flush busy runner. + Assert: No runners. + """ + runner_scaler = runner_scaler_one_runner + + runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert_runner_info(runner_scaler, online=0) + + +def test_flush_busy_on_busy_runner( + runner_scaler_one_runner: RunnerScaler, +): + """ + Arrange: A RunnerScaler with one busy runner. + Act: Run flush busy runner. + Assert: No runners. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.BUSY) + + runner_scaler.flush(flush_mode=FlushMode.FLUSH_BUSY) + assert_runner_info(runner_scaler, online=0) + + +def test_get_runner_one_busy_runner( + runner_scaler_one_runner: RunnerScaler, +): + """ + Arrange: A RunnerScaler with one busy runner. + Act: Run get runners. + Assert: One busy runner. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.BUSY) + + assert_runner_info(runner_scaler=runner_scaler, online=1, busy=1) + + +def test_get_runner_offline_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one offline runner. + Act: Run get runners. + Assert: One offline runner. 
+ """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, GitHubRunnerState.OFFLINE) + + assert_runner_info(runner_scaler=runner_scaler, offline=1) + + +def test_get_runner_unknown_runner(runner_scaler_one_runner: RunnerScaler): + """ + Arrange: A RunnerScaler with one runner in an unknown state. + Act: Run get runners. + Assert: One unknown runner. + """ + runner_scaler = runner_scaler_one_runner + set_one_runner_state(runner_scaler, "UNKNOWN") + + assert_runner_info(runner_scaler=runner_scaler, unknown=1) diff --git a/tests/unit/test_shared_fs.py b/tests/unit/test_shared_fs.py index 0c1266566..2a21bf3cc 100644 --- a/tests/unit/test_shared_fs.py +++ b/tests/unit/test_shared_fs.py @@ -7,6 +7,7 @@ import pytest from _pytest.monkeypatch import MonkeyPatch +from github_runner_manager.metrics.storage import MetricsStorage import shared_fs from errors import ( @@ -15,7 +16,6 @@ GetMetricsStorageError, SubprocessError, ) -from metrics.storage import MetricsStorage MOUNTPOINT_FAILURE_EXIT_CODE = 1