diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 3f549c8ae..a52b62ca9 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -34,7 +34,7 @@ jobs:
           EOF
 
       - name: Cache github-runner Charm
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         id: cache-charm
         with:
          path: github-runner_ubuntu-22.04-amd64.charm
diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
index c058dd3bf..b6bdea75a 100644
--- a/.github/workflows/integration_test.yaml
+++ b/.github/workflows/integration_test.yaml
@@ -13,7 +13,7 @@ jobs:
       pre-run-script: scripts/pre-integration-test.sh
       provider: lxd
       test-tox-env: integration-juju2.9
-      modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
+      modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
   integration-tests-juju3:
     name: Integration test with juju 3.1
     uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
@@ -23,4 +23,4 @@ jobs:
       pre-run-script: scripts/pre-integration-test.sh
       provider: lxd
       test-tox-env: integration-juju3.1
-      modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
+      modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
diff --git a/.github/workflows/workflow_dispatch_ssh_debug.yaml b/.github/workflows/workflow_dispatch_ssh_debug.yaml
index a44458bc4..e4f388c14 100644
--- a/.github/workflows/workflow_dispatch_ssh_debug.yaml
+++ b/.github/workflows/workflow_dispatch_ssh_debug.yaml
@@ -14,4 +14,4 @@ jobs:
     steps:
       - name: Setup tmate session
        uses: canonical/action-tmate@chore/env_var_change
-        timeout-minutes: 1
+        timeout-minutes: 5
diff --git a/scripts/build-image.sh b/scripts/build-image.sh
index 315bcd09c..5fb616feb 100644
--- a/scripts/build-image.sh
+++ b/scripts/build-image.sh
@@ -87,6 +87,8 @@ retry '/snap/bin/lxc exec builder -- /usr/bin/nslookup github.com' 'Wait for net
 /snap/bin/lxc exec builder -- /usr/bin/apt-get update
 /snap/bin/lxc exec builder --env DEBIAN_FRONTEND=noninteractive -- /usr/bin/apt-get upgrade -yq
 /snap/bin/lxc exec builder --env DEBIAN_FRONTEND=noninteractive -- /usr/bin/apt-get install linux-generic-hwe-22.04 -yq
+# This will remove older version of kernel as HWE is installed now.
+/snap/bin/lxc exec builder -- /usr/bin/apt-get autoremove --purge
 /snap/bin/lxc restart builder
 
 retry '/snap/bin/lxc exec builder -- /usr/bin/who' 'Wait for lxd agent to be ready' 30
@@ -107,6 +109,10 @@ fi
 /snap/bin/lxc exec builder -- /usr/sbin/usermod -aG docker ubuntu
 /snap/bin/lxc exec builder -- /usr/sbin/iptables -I DOCKER-USER -j ACCEPT
 
+# Reduce image size
+/snap/bin/lxc exec builder -- /usr/bin/npm cache clean --force
+/snap/bin/lxc exec builder -- /usr/bin/apt-get clean
+
 # Download and verify checksum of yq
 if [[ $(uname -m) == 'aarch64' ]]; then
     YQ_ARCH="arm64"
diff --git a/src-docs/event_timer.py.md b/src-docs/event_timer.py.md
index aa0b2290b..24b5c4422 100644
--- a/src-docs/event_timer.py.md
+++ b/src-docs/event_timer.py.md
@@ -48,7 +48,7 @@ Construct the timer manager.
 
 ---
 
-
+
 
 ### function `disable_event_timer`
@@ -96,6 +96,7 @@ The timeout is the number of seconds before an event is timed out. If not set or
 
 
  - `event_name`: Name of the juju event to schedule.
  - `interval`: Number of minutes between emitting each event.
+ - `timeout`: Timeout for each event handle in minutes.
 
diff --git a/src/charm.py b/src/charm.py
index 87e9fb828..a058a85ac 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -163,7 +163,6 @@ def __init__(self, *args, **kargs) -> None:
             path=self.config["path"],  # for detecting changes
             token=self.config["token"],  # for detecting changes
             runner_bin_url=None,
-            runner_image_url=None,
         )
 
         self.proxies: ProxySetting = {}
@@ -503,7 +502,9 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None:
         self._refresh_firewall()
         try:
             self._event_timer.ensure_event_timer(
-                "reconcile-runners", self.config["reconcile-interval"]
+                event_name="reconcile-runners",
+                interval=int(self.config["reconcile-interval"]),
+                timeout=int(self.config["reconcile-interval"]) - 1,
             )
         except TimerEnableError as ex:
             logger.exception("Failed to start the event timer")
diff --git a/src/event_timer.py b/src/event_timer.py
index 826a80bde..67b87e616 100644
--- a/src/event_timer.py
+++ b/src/event_timer.py
@@ -69,15 +69,21 @@ def ensure_event_timer(self, event_name: str, interval: int, timeout: Optional[i
         Args:
             event_name: Name of the juju event to schedule.
             interval: Number of minutes between emitting each event.
+            timeout: Timeout for each event handle in minutes.
 
         Raises:
             TimerEnableError: Timer cannot be started. Events will be not emitted.
         """
+        if timeout is not None:
+            timeout_in_secs = timeout * 60
+        else:
+            timeout_in_secs = interval * 30
+
         context: EventConfig = {
             "event": event_name,
             "interval": interval,
             "random_delay": interval // 4,
-            "timeout": timeout or (interval * 30),
+            "timeout": timeout_in_secs,
             "unit": self.unit_name,
         }
         self._render_event_template("service", event_name, context)
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 000000000..188515554
--- /dev/null
+++ b/tests/integration/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
diff --git a/tests/integration/test_charm_metrics.py b/tests/integration/charm_metrics_helpers.py
similarity index 50%
rename from tests/integration/test_charm_metrics.py
rename to tests/integration/charm_metrics_helpers.py
index dd25d325d..5b61ca58c 100644
--- a/tests/integration/test_charm_metrics.py
+++ b/tests/integration/charm_metrics_helpers.py
@@ -1,39 +1,31 @@
-# Copyright 2024 Canonical Ltd. 
+# Copyright 2024 Canonical Ltd.
 # See LICENSE file for licensing details.
 
-"""Integration tests for metrics/logs."""
+"""Utilities for charm metrics integration tests."""
+
+
 import json
 import logging
 from datetime import datetime, timezone
 from time import sleep
-from typing import AsyncIterator
 
-import pytest
-import pytest_asyncio
 import requests
 from github.Branch import Branch
 from github.Repository import Repository
 from github.Workflow import Workflow
 from juju.application import Application
-from juju.model import Model
 from juju.unit import Unit
 
-import runner_logs
 from github_type import JobConclusion
 from metrics import METRICS_LOG_PATH
 from runner_metrics import PostJobStatus
 from tests.integration.helpers import (
-    DISPATCH_CRASH_TEST_WORKFLOW_FILENAME,
     DISPATCH_FAILURE_TEST_WORKFLOW_FILENAME,
     DISPATCH_TEST_WORKFLOW_FILENAME,
-    ensure_charm_has_runner,
     get_runner_name,
     get_runner_names,
-    reconcile,
-    run_in_lxd_instance,
     run_in_unit,
 )
-from tests.status_name import ACTIVE
 
 TEST_WORKFLOW_NAMES = [
     "Workflow Dispatch Tests",
@@ -43,97 +35,6 @@ JOB_LOG_START_MSG_TEMPLATE = "Job is about to start running on the runner: {runner_name}"
 
 
-@pytest_asyncio.fixture(scope="module", name="app_integrated")
-async def app_integrated_fixture(
-    model: Model, app_no_runner: Application
-) -> AsyncIterator[Application]:
-    """Setup the charm to be integrated with grafana-agent using the cos-agent integration."""
-    await _integrate_apps(app_no_runner, model)
-
-    yield app_no_runner
-
-
-async def _clear_metrics_log(unit: Unit) -> None:
-    """Clear the metrics log on the unit.
-
-    Args:
-        unit: The unit to clear the metrics log on.
-    """
-    retcode, _ = await run_in_unit(
-        unit=unit,
-        command=f"if [ -f {METRICS_LOG_PATH} ]; then rm {METRICS_LOG_PATH}; fi",
-    )
-    assert retcode == 0, "Failed to clear metrics log"
-
-
-async def _print_loop_device_info(unit: Unit, loop_device: str) -> None:
-    """Print loop device info on the unit.
-
-    Args:
-        unit: The unit to print the loop device info on.
-        loop_device: The loop device to print the info for.
-    """
-    retcode, stdout = await run_in_unit(
-        unit=unit,
-        command="sudo losetup -lJ",
-    )
-    assert retcode == 0, f"Failed to get loop devices: {stdout}"
-    assert stdout is not None, "Failed to get loop devices, no stdout message"
-    loop_devices_info = json.loads(stdout)
-    for loop_device_info in loop_devices_info["loopdevices"]:
-        if loop_device_info["name"] == loop_device:
-            logging.info("Loop device %s info: %s", loop_device, loop_device_info)
-            break
-    else:
-        logging.info("Loop device %s not found", loop_device)
-
-
-@pytest_asyncio.fixture(scope="function", name="app")
-async def app_fixture(
-    model: Model, app_integrated: Application, loop_device: str
-) -> AsyncIterator[Application]:
-    """Setup and teardown the charm after each test.
-
-    Clear the metrics log before each test.
-    """
-    unit = app_integrated.units[0]
-    await _clear_metrics_log(unit)
-    await _print_loop_device_info(unit, loop_device)
-    yield app_integrated
-
-
-async def _get_metrics_log(unit: Unit) -> str:
-    """Retrieve the metrics log from the unit.
-
-    Args:
-        unit: The unit to retrieve the metrics log from.
-
-    Returns:
-        The metrics log.
-    """
-    retcode, stdout = await run_in_unit(
-        unit=unit,
-        command=f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi",
-    )
-    assert retcode == 0, f"Failed to get metrics log: {stdout}"
-    assert stdout is not None, "Failed to get metrics log, no stdout message"
-    logging.info("Metrics log: %s", stdout)
-    return stdout.strip()
-
-
-async def _integrate_apps(app: Application, model: Model):
-    """Integrate the charm with grafana-agent using the cos-agent integration.
-
-    Args:
-        app: The charm to integrate.
-        model: The model to deploy the grafana-agent to.
-    """
-    grafana_agent = await model.deploy("grafana-agent", channel="latest/edge")
-    await model.relate(f"{app.name}:cos-agent", f"{grafana_agent.name}:cos-agent")
-    await model.wait_for_idle(apps=[app.name], status=ACTIVE)
-    await model.wait_for_idle(apps=[grafana_agent.name])
-
-
 async def _wait_until_runner_is_used_up(runner_name: str, unit: Unit):
     """Wait until the runner is used up.
@@ -215,6 +116,60 @@ async def _wait_for_workflow_to_start(unit: Unit, workflow: Workflow):
         assert False, "Timeout while waiting for the workflow to start"
 
 
+async def clear_metrics_log(unit: Unit) -> None:
+    """Clear the metrics log on the unit.
+
+    Args:
+        unit: The unit to clear the metrics log on.
+    """
+    retcode, _ = await run_in_unit(
+        unit=unit,
+        command=f"if [ -f {METRICS_LOG_PATH} ]; then rm {METRICS_LOG_PATH}; fi",
+    )
+    assert retcode == 0, "Failed to clear metrics log"
+
+
+async def print_loop_device_info(unit: Unit, loop_device: str) -> None:
+    """Print loop device info on the unit.
+
+    Args:
+        unit: The unit to print the loop device info on.
+        loop_device: The loop device to print the info for.
+    """
+    retcode, stdout = await run_in_unit(
+        unit=unit,
+        command="sudo losetup -lJ",
+    )
+    assert retcode == 0, f"Failed to get loop devices: {stdout}"
+    assert stdout is not None, "Failed to get loop devices, no stdout message"
+    loop_devices_info = json.loads(stdout)
+    for loop_device_info in loop_devices_info["loopdevices"]:
+        if loop_device_info["name"] == loop_device:
+            logging.info("Loop device %s info: %s", loop_device, loop_device_info)
+            break
+    else:
+        logging.info("Loop device %s not found", loop_device)
+
+
+async def get_metrics_log(unit: Unit) -> str:
+    """Retrieve the metrics log from the unit.
+
+    Args:
+        unit: The unit to retrieve the metrics log from.
+
+    Returns:
+        The metrics log.
+    """
+    retcode, stdout = await run_in_unit(
+        unit=unit,
+        command=f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi",
+    )
+    assert retcode == 0, f"Failed to get metrics log: {stdout}"
+    assert stdout is not None, "Failed to get metrics log, no stdout message"
+    logging.info("Metrics log: %s", stdout)
+    return stdout.strip()
+
+
 async def _cancel_workflow_run(unit: Unit, workflow: Workflow):
     """Cancel the workflow run.
 
@@ -234,7 +189,7 @@ async def _cancel_workflow_run(unit: Unit, workflow: Workflow):
         run.cancel()
 
 
-async def _dispatch_workflow(
+async def dispatch_workflow(
     app: Application, branch: Branch, github_repository: Repository, conclusion: str
 ):
     """Dispatch a workflow on a branch for the runner to run.
@@ -262,7 +217,7 @@
     )
 
 
-async def _assert_events_after_reconciliation(
+async def assert_events_after_reconciliation(
     app: Application, github_repository: Repository, post_job_status: PostJobStatus
 ):
     """Assert that the RunnerStart, RunnerStop and Reconciliation metric is logged.
@@ -274,7 +229,7 @@ async def _assert_events_after_reconciliation(
     """
     unit = app.units[0]
 
-    metrics_log = await _get_metrics_log(unit=unit)
+    metrics_log = await get_metrics_log(unit=unit)
     log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines()))
     events = set(map(lambda line: line.get("event"), log_lines))
     assert {
@@ -317,7 +272,7 @@ async def _assert_events_after_reconciliation(
             assert metric_log.get("idle_runners") >= 0
 
 
-async def _wait_for_runner_to_be_marked_offline(
+async def wait_for_runner_to_be_marked_offline(
     forked_github_repository: Repository, runner_name: str
 ):
     """Wait for the runner to be marked offline or to be non-existent.
@@ -341,223 +296,3 @@ async def _wait_for_runner_to_be_marked_offline(
             break
     else:
         assert False, "Timeout while waiting for runner to be marked offline"
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_runner_installed_metric(app: Application, model: Model):
-    """
-    arrange: A charm without runners integrated with grafana-agent using the cos-agent integration.
-    act: Config the charm to contain one runner.
-    assert: The RunnerInstalled metric is logged.
-    """
-
-    await ensure_charm_has_runner(app=app, model=model)
-
-    metrics_log = await _get_metrics_log(app.units[0])
-    log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines()))
-    events = set(map(lambda line: line.get("event"), log_lines))
-    assert "runner_installed" in events, "runner_installed event has not been logged"
-
-    for metric_log in log_lines:
-        if metric_log.get("event") == "runner_installed":
-            assert metric_log.get("flavor") == app.name
-            assert metric_log.get("event") == "runner_installed"
-            assert metric_log.get("duration") >= 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_metrics_after_reconciliation(
-    model: Model,
-    app: Application,
-    forked_github_repository: Repository,
-    forked_github_branch: Branch,
-):
-    """
-    arrange: A properly integrated charm with a runner registered on the fork repo.
-    act: Dispatch a workflow on a branch for the runner to run. After completion, reconcile.
-    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
-        The Reconciliation metric has the post job status set to normal.
-    """
-    await app.set_config({"path": forked_github_repository.full_name})
-    await ensure_charm_has_runner(app=app, model=model)
-
-    # Clear metrics log to make reconciliation event more predictable
-    unit = app.units[0]
-    await _clear_metrics_log(unit)
-    await _dispatch_workflow(
-        app=app,
-        branch=forked_github_branch,
-        github_repository=forked_github_repository,
-        conclusion="success",
-    )
-
-    # Set the number of virtual machines to 0 to speedup reconciliation
-    await app.set_config({"virtual-machines": "0"})
-    await reconcile(app=app, model=model)
-
-    await _assert_events_after_reconciliation(
-        app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_metrics_for_failed_repo_policy(
-    model: Model,
-    app: Application,
-    forked_github_repository: Repository,
-    forked_github_branch: Branch,
-):
-    """
-    arrange: A properly integrated charm with a runner registered on the fork repo.
-    act: Dispatch a test workflow that fails the repo-policy check. After completion, reconcile.
-    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
-        The Reconciliation metric has the post job status set to failure.
-    """
-    await app.set_config({"path": forked_github_repository.full_name})
-    await ensure_charm_has_runner(app=app, model=model)
-
-    # Clear metrics log to make reconciliation event more predictable
-    unit = app.units[0]
-    await _clear_metrics_log(unit)
-    await _dispatch_workflow(
-        app=app,
-        branch=forked_github_branch,
-        github_repository=forked_github_repository,
-        conclusion="failure",
-    )
-
-    # Set the number of virtual machines to 0 to speedup reconciliation
-    await app.set_config({"virtual-machines": "0"})
-    await reconcile(app=app, model=model)
-
-    await _assert_events_after_reconciliation(
-        app=app,
-        github_repository=forked_github_repository,
-        post_job_status=PostJobStatus.REPO_POLICY_CHECK_FAILURE,
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_metrics_for_abnormal_termination(
-    model: Model,
-    app: Application,
-    forked_github_repository: Repository,
-    forked_github_branch: Branch,
-):
-    """
-    arrange: A properly integrated charm with a runner registered on the fork repo.
-    act: Dispatch a test workflow and afterwards kill run.sh. After that, reconcile.
-    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
-        The Reconciliation metric has the post job status set to Abnormal.
-    """
-    await app.set_config({"path": forked_github_repository.full_name})
-    await ensure_charm_has_runner(app=app, model=model)
-
-    unit = app.units[0]
-
-    workflow = forked_github_repository.get_workflow(
-        id_or_file_name=DISPATCH_CRASH_TEST_WORKFLOW_FILENAME
-    )
-    assert workflow.create_dispatch(forked_github_branch, {"runner": app.name})
-
-    await _wait_for_workflow_to_start(unit, workflow)
-
-    # Make the runner terminate abnormally by killing run.sh
-    runner_name = await get_runner_name(unit)
-    kill_run_sh_cmd = "pkill -9 run.sh"
-    ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd)
-    assert ret_code == 0, "Failed to kill run.sh"
-
-    # Cancel workflow and wait that the runner is marked offline
-    # to avoid errors during reconciliation.
-    await _cancel_workflow_run(unit, workflow)
-    await _wait_for_runner_to_be_marked_offline(forked_github_repository, runner_name)
-
-    # Set the number of virtual machines to 0 to speedup reconciliation
-    await app.set_config({"virtual-machines": "0"})
-    await reconcile(app=app, model=model)
-
-    await _assert_events_after_reconciliation(
-        app=app,
-        github_repository=forked_github_repository,
-        post_job_status=PostJobStatus.ABNORMAL,
-    )
-
-
-async def test_charm_remounts_shared_fs(
-    model: Model,
-    app: Application,
-    forked_github_repository: Repository,
-    forked_github_branch: Branch,
-):
-    """
-    arrange: A properly integrated charm with a runner registered on the fork repo.
-    act: Dispatch a test workflow and afterwards unmount the shared fs. After that, reconcile.
-    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
-    """
-    await app.set_config({"path": forked_github_repository.full_name})
-    await ensure_charm_has_runner(app=app, model=model)
-
-    # Clear metrics log to make reconciliation event more predictable
-    unit = app.units[0]
-    runner_name = await get_runner_name(unit)
-    await _clear_metrics_log(unit)
-    await _dispatch_workflow(
-        app=app,
-        branch=forked_github_branch,
-        github_repository=forked_github_repository,
-        conclusion="success",
-    )
-
-    # unmount shared fs
-    await run_in_unit(unit, f"sudo umount /home/ubuntu/runner-fs/{runner_name}")
-
-    # Set the number of virtual machines to 0 to speedup reconciliation
-    await app.set_config({"virtual-machines": "0"})
-    await reconcile(app=app, model=model)
-
-    await _assert_events_after_reconciliation(
-        app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_retrieves_logs_from_unhealthy_runners(
-    model: Model,
-    app: Application,
-):
-    """
-    arrange: A properly integrated charm with one runner.
-    act: Kill the start.sh script, which marks the runner as unhealthy. After that, reconcile.
-    assert: The logs are pulled from the crashed runner.
-    """
-    await ensure_charm_has_runner(app=app, model=model)
-
-    unit = app.units[0]
-    runner_name = await get_runner_name(unit)
-
-    kill_start_sh_cmd = "pkill -9 start.sh"
-    ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd)
-    assert ret_code == 0, "Failed to kill start.sh"
-
-    # Set the number of virtual machines to 0 to avoid to speedup reconciliation.
-    await app.set_config({"virtual-machines": "0"})
-    await reconcile(app=app, model=model)
-
-    ret_code, stdout = await run_in_unit(unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}")
-    assert ret_code == 0, "Failed to list crashed runner logs"
-    assert stdout
-    assert runner_name in stdout, "Failed to find crashed runner log"
-
-    ret_code, stdout = await run_in_unit(
-        unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}/{runner_name}"
-    )
-    assert ret_code == 0, "Failed to list crashed runner log"
-    assert stdout
-    assert "_diag" in stdout, "Failed to find crashed runner diag log"
-    assert "syslog" in stdout, "Failed to find crashed runner syslog log"
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index b2b257a62..fd173d57c 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -434,3 +434,16 @@ def get_branch():
     yield get_branch()
 
     branch_ref.delete()
+
+
+@pytest_asyncio.fixture(scope="module", name="app_with_grafana_agent")
+async def app_with_grafana_agent_integrated_fixture(
+    model: Model, app_no_runner: Application
+) -> AsyncIterator[Application]:
+    """Setup the charm to be integrated with grafana-agent using the cos-agent integration."""
+    grafana_agent = await model.deploy("grafana-agent", channel="latest/edge")
+    await model.relate(f"{app_no_runner.name}:cos-agent", f"{grafana_agent.name}:cos-agent")
+    await model.wait_for_idle(apps=[app_no_runner.name], status=ACTIVE)
+    await model.wait_for_idle(apps=[grafana_agent.name])
+
+    yield app_no_runner
diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py
new file mode 100644
index 000000000..35b4b2ab3
--- /dev/null
+++ b/tests/integration/test_charm_metrics_failure.py
@@ -0,0 +1,170 @@
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+"""Integration tests for metrics/logs assuming Github workflow failures or a runner crash."""
+from typing import AsyncIterator
+
+import pytest
+import pytest_asyncio
+from github.Branch import Branch
+from github.Repository import Repository
+from juju.application import Application
+from juju.model import Model
+
+import runner_logs
+from runner_metrics import PostJobStatus
+from tests.integration.charm_metrics_helpers import (
+    _cancel_workflow_run,
+    _wait_for_workflow_to_start,
+    assert_events_after_reconciliation,
+    clear_metrics_log,
+    dispatch_workflow,
+    print_loop_device_info,
+    wait_for_runner_to_be_marked_offline,
+)
+from tests.integration.helpers import (
+    DISPATCH_CRASH_TEST_WORKFLOW_FILENAME,
+    ensure_charm_has_runner,
+    get_runner_name,
+    reconcile,
+    run_in_lxd_instance,
+    run_in_unit,
+)
+
+
+@pytest_asyncio.fixture(scope="function", name="app")
+async def app_fixture(
+    model: Model, app_with_grafana_agent: Application, loop_device: str
+) -> AsyncIterator[Application]:
+    """Setup and teardown the charm after each test.
+
+    Clear the metrics log before each test.
+    """
+    unit = app_with_grafana_agent.units[0]
+    await clear_metrics_log(unit)
+    await print_loop_device_info(unit, loop_device)
+    yield app_with_grafana_agent
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_metrics_for_failed_repo_policy(
+    model: Model,
+    app: Application,
+    forked_github_repository: Repository,
+    forked_github_branch: Branch,
+):
+    """
+    arrange: A properly integrated charm with a runner registered on the fork repo.
+    act: Dispatch a test workflow that fails the repo-policy check. After completion, reconcile.
+    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+        The Reconciliation metric has the post job status set to failure.
+    """
+    await app.set_config({"path": forked_github_repository.full_name})
+    await ensure_charm_has_runner(app=app, model=model)
+
+    # Clear metrics log to make reconciliation event more predictable
+    unit = app.units[0]
+    await clear_metrics_log(unit)
+    await dispatch_workflow(
+        app=app,
+        branch=forked_github_branch,
+        github_repository=forked_github_repository,
+        conclusion="failure",
+    )
+
+    # Set the number of virtual machines to 0 to speedup reconciliation
+    await app.set_config({"virtual-machines": "0"})
+    await reconcile(app=app, model=model)
+
+    await assert_events_after_reconciliation(
+        app=app,
+        github_repository=forked_github_repository,
+        post_job_status=PostJobStatus.REPO_POLICY_CHECK_FAILURE,
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_metrics_for_abnormal_termination(
+    model: Model,
+    app: Application,
+    forked_github_repository: Repository,
+    forked_github_branch: Branch,
+):
+    """
+    arrange: A properly integrated charm with a runner registered on the fork repo.
+    act: Dispatch a test workflow and afterwards kill run.sh. After that, reconcile.
+    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+        The Reconciliation metric has the post job status set to Abnormal.
+    """
+    await app.set_config({"path": forked_github_repository.full_name})
+    await ensure_charm_has_runner(app=app, model=model)
+
+    unit = app.units[0]
+
+    workflow = forked_github_repository.get_workflow(
+        id_or_file_name=DISPATCH_CRASH_TEST_WORKFLOW_FILENAME
+    )
+    assert workflow.create_dispatch(forked_github_branch, {"runner": app.name})
+
+    await _wait_for_workflow_to_start(unit, workflow)
+
+    # Make the runner terminate abnormally by killing run.sh
+    runner_name = await get_runner_name(unit)
+    kill_run_sh_cmd = "pkill -9 run.sh"
+    ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd)
+    assert ret_code == 0, "Failed to kill run.sh"
+
+    # Cancel workflow and wait that the runner is marked offline
+    # to avoid errors during reconciliation.
+    await _cancel_workflow_run(unit, workflow)
+    await wait_for_runner_to_be_marked_offline(forked_github_repository, runner_name)
+
+    # Set the number of virtual machines to 0 to speedup reconciliation
+    await app.set_config({"virtual-machines": "0"})
+    await reconcile(app=app, model=model)
+
+    await assert_events_after_reconciliation(
+        app=app,
+        github_repository=forked_github_repository,
+        post_job_status=PostJobStatus.ABNORMAL,
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_retrieves_logs_from_unhealthy_runners(
+    model: Model,
+    app: Application,
+):
+    """
+    arrange: A properly integrated charm with one runner.
+    act: Kill the start.sh script, which marks the runner as unhealthy. After that, reconcile.
+    assert: The logs are pulled from the crashed runner.
+    """
+    await ensure_charm_has_runner(app=app, model=model)
+
+    unit = app.units[0]
+    runner_name = await get_runner_name(unit)
+
+    kill_start_sh_cmd = "pkill -9 start.sh"
+    ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd)
+    assert ret_code == 0, "Failed to kill start.sh"
+
+    # Set the number of virtual machines to 0 to avoid to speedup reconciliation.
+    await app.set_config({"virtual-machines": "0"})
+    await reconcile(app=app, model=model)
+
+    ret_code, stdout = await run_in_unit(unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}")
+    assert ret_code == 0, "Failed to list crashed runner logs"
+    assert stdout
+    assert runner_name in stdout, "Failed to find crashed runner log"
+
+    ret_code, stdout = await run_in_unit(
+        unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}/{runner_name}"
+    )
+    assert ret_code == 0, "Failed to list crashed runner log"
+    assert stdout
+    assert "_diag" in stdout, "Failed to find crashed runner diag log"
+    assert "syslog" in stdout, "Failed to find crashed runner syslog log"
diff --git a/tests/integration/test_charm_metrics_success.py b/tests/integration/test_charm_metrics_success.py
new file mode 100644
index 000000000..cf433084d
--- /dev/null
+++ b/tests/integration/test_charm_metrics_success.py
@@ -0,0 +1,141 @@
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+"""Integration tests for metrics/logs assuming no Github workflow failures."""
+
+import json
+from typing import AsyncIterator
+
+import pytest
+import pytest_asyncio
+from github.Branch import Branch
+from github.Repository import Repository
+from juju.application import Application
+from juju.model import Model
+
+from runner_metrics import PostJobStatus
+from tests.integration.charm_metrics_helpers import (
+    assert_events_after_reconciliation,
+    clear_metrics_log,
+    dispatch_workflow,
+    get_metrics_log,
+    print_loop_device_info,
+)
+from tests.integration.helpers import (
+    ensure_charm_has_runner,
+    get_runner_name,
+    reconcile,
+    run_in_unit,
+)
+
+
+@pytest_asyncio.fixture(scope="function", name="app")
+async def app_fixture(
+    model: Model, app_with_grafana_agent: Application, loop_device: str
+) -> AsyncIterator[Application]:
+    """Setup and teardown the charm after each test.
+
+    Clear the metrics log before each test.
+    """
+    unit = app_with_grafana_agent.units[0]
+    await clear_metrics_log(unit)
+    await print_loop_device_info(unit, loop_device)
+    yield app_with_grafana_agent
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_runner_installed_metric(app: Application, model: Model):
+    """
+    arrange: A charm without runners integrated with grafana-agent using the cos-agent integration.
+    act: Config the charm to contain one runner.
+    assert: The RunnerInstalled metric is logged.
+    """
+
+    await ensure_charm_has_runner(app=app, model=model)
+
+    metrics_log = await get_metrics_log(app.units[0])
+    log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines()))
+    events = set(map(lambda line: line.get("event"), log_lines))
+    assert "runner_installed" in events, "runner_installed event has not been logged"
+
+    for metric_log in log_lines:
+        if metric_log.get("event") == "runner_installed":
+            assert metric_log.get("flavor") == app.name
+            assert metric_log.get("event") == "runner_installed"
+            assert metric_log.get("duration") >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_metrics_after_reconciliation(
+    model: Model,
+    app: Application,
+    forked_github_repository: Repository,
+    forked_github_branch: Branch,
+):
+    """
+    arrange: A properly integrated charm with a runner registered on the fork repo.
+    act: Dispatch a workflow on a branch for the runner to run. After completion, reconcile.
+    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+        The Reconciliation metric has the post job status set to normal.
+    """
+    await app.set_config({"path": forked_github_repository.full_name})
+    await ensure_charm_has_runner(app=app, model=model)
+
+    # Clear metrics log to make reconciliation event more predictable
+    unit = app.units[0]
+    await clear_metrics_log(unit)
+    await dispatch_workflow(
+        app=app,
+        branch=forked_github_branch,
+        github_repository=forked_github_repository,
+        conclusion="success",
+    )
+
+    # Set the number of virtual machines to 0 to speedup reconciliation
+    await app.set_config({"virtual-machines": "0"})
+    await reconcile(app=app, model=model)
+
+    await assert_events_after_reconciliation(
+        app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_remounts_shared_fs(
+    model: Model,
+    app: Application,
+    forked_github_repository: Repository,
+    forked_github_branch: Branch,
+):
+    """
+    arrange: A properly integrated charm with a runner registered on the fork repo.
+    act: Dispatch a test workflow and afterwards unmount the shared fs. After that, reconcile.
+    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+    """
+    await app.set_config({"path": forked_github_repository.full_name})
+    await ensure_charm_has_runner(app=app, model=model)
+
+    # Clear metrics log to make reconciliation event more predictable
+    unit = app.units[0]
+    runner_name = await get_runner_name(unit)
+    await clear_metrics_log(unit)
+    await dispatch_workflow(
+        app=app,
+        branch=forked_github_branch,
+        github_repository=forked_github_repository,
+        conclusion="success",
+    )
+
+    # unmount shared fs
+    await run_in_unit(unit, f"sudo umount /home/ubuntu/runner-fs/{runner_name}")
+
+    # Set the number of virtual machines to 0 to speedup reconciliation
+    await app.set_config({"virtual-machines": "0"})
+    await reconcile(app=app, model=model)
+
+    await assert_events_after_reconciliation(
+        app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
+    )
diff --git a/tests/integration/test_debug_ssh.py b/tests/integration/test_debug_ssh.py
index de3aeff5d..c9ef1d480 100644
--- a/tests/integration/test_debug_ssh.py
+++ b/tests/integration/test_debug_ssh.py
@@ -69,11 +69,19 @@ def is_workflow_complete():
     )
     zip_data = BytesIO(response.content)
     with zipfile.ZipFile(zip_data, "r") as zip_ref:
+        logger.info("Files: %s", zip_ref.namelist())
         tmate_log_filename = next(
-            iter([name for name in zip_ref.namelist() if "Setup tmate session" in name])
+            iter(
+                [
+                    name
+                    for name in zip_ref.namelist()
+                    if "workflow-dispatch-tests/3_Setup tmate session.txt" == name
+                ]
+            )
         )
         logs = str(zip_ref.read(tmate_log_filename), encoding="utf-8")
         # ensure ssh connection info printed in logs.
+        logger.info("Logs: %s", logs)
         assert tmate_ssh_server_unit_ip in logs, "Tmate ssh server IP not found in action logs."
         assert "10022" in logs, "Tmate ssh server connection port not found in action logs."