From d0aa351bff74a33250db52511ad490c7088a91e8 Mon Sep 17 00:00:00 2001
From: yhaliaw <43424755+yhaliaw@users.noreply.github.com>
Date: Wed, 17 Jan 2024 19:01:33 +0800
Subject: [PATCH 1/5] Increase reconcile timeout (#191)

* Increase reconcile timeout

* Event timeouts are in minutes
---
 src-docs/event_timer.py.md |  3 ++-
 src/charm.py               |  5 +++--
 src/event_timer.py         |  8 +++++++-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src-docs/event_timer.py.md b/src-docs/event_timer.py.md
index aa0b2290b..24b5c4422 100644
--- a/src-docs/event_timer.py.md
+++ b/src-docs/event_timer.py.md
@@ -48,7 +48,7 @@ Construct the timer manager.
 
 ---
 
-
+
 
 
 ### function `disable_event_timer`
@@ -96,6 +96,7 @@ The timeout is the number of seconds before an event is timed out. If not set or
 
 
 - `event_name`: Name of the juju event to schedule.
 - `interval`: Number of minutes between emitting each event.
+ - `timeout`: Timeout for each event handle in minutes.
 
diff --git a/src/charm.py b/src/charm.py
index cba4002f1..ad1a211b9 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -163,7 +163,6 @@ def __init__(self, *args, **kargs) -> None:
             path=self.config["path"],  # for detecting changes
             token=self.config["token"],  # for detecting changes
             runner_bin_url=None,
-            runner_image_url=None,
         )
 
         self.proxies: ProxySetting = {}
@@ -503,7 +502,9 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None:
             self._refresh_firewall()
         try:
             self._event_timer.ensure_event_timer(
-                "reconcile-runners", self.config["reconcile-interval"]
+                event_name="reconcile-runners",
+                interval=int(self.config["reconcile-interval"]),
+                timeout=int(self.config["reconcile-interval"]) - 1,
             )
         except TimerEnableError as ex:
             logger.exception("Failed to start the event timer")
diff --git a/src/event_timer.py b/src/event_timer.py
index 826a80bde..67b87e616 100644
--- a/src/event_timer.py
+++ b/src/event_timer.py
@@ -69,15 +69,21 @@ def ensure_event_timer(self, event_name: str, interval: int, timeout: Optional[i
 
         Args:
             event_name: Name of the juju event to schedule.
             interval: Number of minutes between emitting each event.
+            timeout: Timeout for each event handle in minutes.
 
         Raises:
             TimerEnableError: Timer cannot be started. Events will be not emitted.
         """
+        if timeout is not None:
+            timeout_in_secs = timeout * 60
+        else:
+            timeout_in_secs = interval * 30
+
         context: EventConfig = {
             "event": event_name,
             "interval": interval,
             "random_delay": interval // 4,
-            "timeout": timeout or (interval * 30),
+            "timeout": timeout_in_secs,
             "unit": self.unit_name,
         }
         self._render_event_template("service", event_name, context)

From babe46d6613b00884e1a649f08f08971955d1047 Mon Sep 17 00:00:00 2001
From: yhaliaw <43424755+yhaliaw@users.noreply.github.com>
Date: Thu, 18 Jan 2024 10:42:58 +0800
Subject: [PATCH 2/5] Remove previous kernel and clean cache during image
 build (#190)

---
 scripts/build-image.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/build-image.sh b/scripts/build-image.sh
index 315bcd09c..5fb616feb 100644
--- a/scripts/build-image.sh
+++ b/scripts/build-image.sh
@@ -87,6 +87,8 @@ retry '/snap/bin/lxc exec builder -- /usr/bin/nslookup github.com' 'Wait for net
 /snap/bin/lxc exec builder -- /usr/bin/apt-get update
 /snap/bin/lxc exec builder --env DEBIAN_FRONTEND=noninteractive -- /usr/bin/apt-get upgrade -yq
 /snap/bin/lxc exec builder --env DEBIAN_FRONTEND=noninteractive -- /usr/bin/apt-get install linux-generic-hwe-22.04 -yq
+# This will remove the older version of the kernel, as HWE is installed now.
+/snap/bin/lxc exec builder -- /usr/bin/apt-get autoremove --purge
 /snap/bin/lxc restart builder
 retry '/snap/bin/lxc exec builder -- /usr/bin/who' 'Wait for lxd agent to be ready' 30
 
@@ -107,6 +109,10 @@ fi
 /snap/bin/lxc exec builder -- /usr/sbin/usermod -aG docker ubuntu
 /snap/bin/lxc exec builder -- /usr/sbin/iptables -I DOCKER-USER -j ACCEPT
 
+# Reduce image size
+/snap/bin/lxc exec builder -- /usr/bin/npm cache clean --force
+/snap/bin/lxc exec builder -- /usr/bin/apt-get clean
+
 # Download and verify checksum of yq
 if [[ $(uname -m) == 'aarch64' ]]; then
     YQ_ARCH="arm64"

From 0ba6cfebefbf59c3cd5e23f43febe5b3d00cff60 Mon Sep 17 00:00:00 2001
From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com>
Date: Thu, 18 Jan 2024 13:13:09 +0800
Subject: [PATCH 3/5] chore(deps): update actions/cache action to v4 (#197)

---
 .github/workflows/e2e_test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 3f549c8ae..a52b62ca9 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -34,7 +34,7 @@ jobs:
           EOF
 
       - name: Cache github-runner Charm
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         id: cache-charm
         with:
          path: github-runner_ubuntu-22.04-amd64.charm

From 1f11e2e7adc283ca2b7c7d6463fc024994881313 Mon Sep 17 00:00:00 2001
From: Yanks Yoon <37652070+yanksyoon@users.noreply.github.com>
Date: Thu, 18 Jan 2024 14:36:50 +0800
Subject: [PATCH 4/5] test: increase debug ssh workflow timeout (#198)

* test: add testing workflow yaml

* test: allow longer test workflow timeout
---
 .github/workflows/workflow_dispatch_ssh_debug.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/workflow_dispatch_ssh_debug.yaml b/.github/workflows/workflow_dispatch_ssh_debug.yaml
index a44458bc4..e4f388c14 100644
--- a/.github/workflows/workflow_dispatch_ssh_debug.yaml
+++ b/.github/workflows/workflow_dispatch_ssh_debug.yaml
@@ -14,4 +14,4 @@ jobs:
     steps:
       - name: Setup tmate session
         uses: canonical/action-tmate@chore/env_var_change
-        timeout-minutes: 1
+        timeout-minutes: 5

From 0c284423e828dbd000c41e38532f5adfb80d5be4 Mon Sep 17 00:00:00 2001
From: Christopher Bartz
Date: Thu, 18 Jan 2024 11:55:17 +0100
Subject: [PATCH 5/5] Split metrics integration test (#192)

* Split metrics integration test

* Outcomment other integration tests

* outcomment e2e test

* Revert "Split metrics integration test"

This reverts commit 7f994e66

* fix juju 3.1

* Revert "fix juju 3.1"

This reverts commit fd3a27e783e3367c722c2c68ca97c4a73e92c18d.

* Revert "Revert "Split metrics integration test""

This reverts commit 9ec7a4e22cf327c5fd153851ba8208be83964e48.

* Revert "Revert "Revert "Split metrics integration test"""

This reverts commit 2007a249cfee1aa942f164d4ce20dcf06456737c.

* Revert "Revert "fix juju 3.1""

This reverts commit a8667e35037c0db9d048d75e0b6bc6323ec28ed4.

* Revert previous commits

This reverts commit 4160d2040245d1b49628b4ee12beda684c4183b3.

Revert "Outcomment other integration tests"

This reverts commit d90fb3f4e19f387ba7a93027b9c4f5d2f6a00ab4.

Revert "Revert "Revert "Revert "Split metrics integration test""""

This reverts commit cc3ef4446e12e5da95f4151c3f64bb09beab31ff.

Revert "Revert "Revert "fix juju 3.1"""

This reverts commit a91597d2f8d1300e9c0a5bd9b8585bafff961405.
---
 .github/workflows/integration_test.yaml       |   4 +-
 tests/integration/__init__.py                 |   2 +
 ...rm_metrics.py => charm_metrics_helpers.py} | 389 +++---------------
 tests/integration/conftest.py                 |  14 +
 .../integration/test_charm_metrics_failure.py | 170 ++++++++
 .../integration/test_charm_metrics_success.py | 141 +++++++
 6 files changed, 391 insertions(+), 329 deletions(-)
 create mode 100644 tests/integration/__init__.py
 rename tests/integration/{test_charm_metrics.py => charm_metrics_helpers.py} (50%)
 create mode 100644 tests/integration/test_charm_metrics_failure.py
 create mode 100644 tests/integration/test_charm_metrics_success.py

diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
index 003d046d7..341728001 100644
--- a/.github/workflows/integration_test.yaml
+++ b/.github/workflows/integration_test.yaml
@@ -13,7 +13,7 @@ jobs:
       pre-run-script: scripts/pre-integration-test.sh
       provider: lxd
       test-tox-env: integration-juju2.9
-      modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage"]'
+      modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage"]'
   integration-tests-juju3:
     name: Integration test with juju 3.1
     uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
@@ -23,4 +23,4 @@ jobs:
       pre-run-script: scripts/pre-integration-test.sh
       provider: lxd
       test-tox-env: integration-juju3.1
-      modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage"]'
+      modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage"]'
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 000000000..188515554
--- /dev/null
+++ b/tests/integration/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
diff --git a/tests/integration/test_charm_metrics.py b/tests/integration/charm_metrics_helpers.py
similarity index 50%
rename from tests/integration/test_charm_metrics.py
rename to tests/integration/charm_metrics_helpers.py
index dd25d325d..5b61ca58c 100644
--- a/tests/integration/test_charm_metrics.py
+++ b/tests/integration/charm_metrics_helpers.py
@@ -1,39 +1,31 @@
 # Copyright 2024 Canonical Ltd.
 # See LICENSE file for licensing details.
-"""Integration tests for metrics/logs.""" +"""Utilities for charm metrics integration tests.""" + + import json import logging from datetime import datetime, timezone from time import sleep -from typing import AsyncIterator -import pytest -import pytest_asyncio import requests from github.Branch import Branch from github.Repository import Repository from github.Workflow import Workflow from juju.application import Application -from juju.model import Model from juju.unit import Unit -import runner_logs from github_type import JobConclusion from metrics import METRICS_LOG_PATH from runner_metrics import PostJobStatus from tests.integration.helpers import ( - DISPATCH_CRASH_TEST_WORKFLOW_FILENAME, DISPATCH_FAILURE_TEST_WORKFLOW_FILENAME, DISPATCH_TEST_WORKFLOW_FILENAME, - ensure_charm_has_runner, get_runner_name, get_runner_names, - reconcile, - run_in_lxd_instance, run_in_unit, ) -from tests.status_name import ACTIVE TEST_WORKFLOW_NAMES = [ "Workflow Dispatch Tests", @@ -43,97 +35,6 @@ JOB_LOG_START_MSG_TEMPLATE = "Job is about to start running on the runner: {runner_name}" -@pytest_asyncio.fixture(scope="module", name="app_integrated") -async def app_integrated_fixture( - model: Model, app_no_runner: Application -) -> AsyncIterator[Application]: - """Setup the charm to be integrated with grafana-agent using the cos-agent integration.""" - await _integrate_apps(app_no_runner, model) - - yield app_no_runner - - -async def _clear_metrics_log(unit: Unit) -> None: - """Clear the metrics log on the unit. - - Args: - unit: The unit to clear the metrics log on. - """ - retcode, _ = await run_in_unit( - unit=unit, - command=f"if [ -f {METRICS_LOG_PATH} ]; then rm {METRICS_LOG_PATH}; fi", - ) - assert retcode == 0, "Failed to clear metrics log" - - -async def _print_loop_device_info(unit: Unit, loop_device: str) -> None: - """Print loop device info on the unit. - - Args: - unit: The unit to print the loop device info on. - loop_device: The loop device to print the info for. - """ - retcode, stdout = await run_in_unit( - unit=unit, - command="sudo losetup -lJ", - ) - assert retcode == 0, f"Failed to get loop devices: {stdout}" - assert stdout is not None, "Failed to get loop devices, no stdout message" - loop_devices_info = json.loads(stdout) - for loop_device_info in loop_devices_info["loopdevices"]: - if loop_device_info["name"] == loop_device: - logging.info("Loop device %s info: %s", loop_device, loop_device_info) - break - else: - logging.info("Loop device %s not found", loop_device) - - -@pytest_asyncio.fixture(scope="function", name="app") -async def app_fixture( - model: Model, app_integrated: Application, loop_device: str -) -> AsyncIterator[Application]: - """Setup and teardown the charm after each test. - - Clear the metrics log before each test. - """ - unit = app_integrated.units[0] - await _clear_metrics_log(unit) - await _print_loop_device_info(unit, loop_device) - yield app_integrated - - -async def _get_metrics_log(unit: Unit) -> str: - """Retrieve the metrics log from the unit. - - Args: - unit: The unit to retrieve the metrics log from. - - Returns: - The metrics log. 
- """ - retcode, stdout = await run_in_unit( - unit=unit, - command=f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi", - ) - assert retcode == 0, f"Failed to get metrics log: {stdout}" - assert stdout is not None, "Failed to get metrics log, no stdout message" - logging.info("Metrics log: %s", stdout) - return stdout.strip() - - -async def _integrate_apps(app: Application, model: Model): - """Integrate the charm with grafana-agent using the cos-agent integration. - - Args: - app: The charm to integrate. - model: The model to deploy the grafana-agent to. - """ - grafana_agent = await model.deploy("grafana-agent", channel="latest/edge") - await model.relate(f"{app.name}:cos-agent", f"{grafana_agent.name}:cos-agent") - await model.wait_for_idle(apps=[app.name], status=ACTIVE) - await model.wait_for_idle(apps=[grafana_agent.name]) - - async def _wait_until_runner_is_used_up(runner_name: str, unit: Unit): """Wait until the runner is used up. @@ -215,6 +116,60 @@ async def _wait_for_workflow_to_start(unit: Unit, workflow: Workflow): assert False, "Timeout while waiting for the workflow to start" +async def clear_metrics_log(unit: Unit) -> None: + """Clear the metrics log on the unit. + + Args: + unit: The unit to clear the metrics log on. + """ + retcode, _ = await run_in_unit( + unit=unit, + command=f"if [ -f {METRICS_LOG_PATH} ]; then rm {METRICS_LOG_PATH}; fi", + ) + assert retcode == 0, "Failed to clear metrics log" + + +async def print_loop_device_info(unit: Unit, loop_device: str) -> None: + """Print loop device info on the unit. + + Args: + unit: The unit to print the loop device info on. + loop_device: The loop device to print the info for. + """ + retcode, stdout = await run_in_unit( + unit=unit, + command="sudo losetup -lJ", + ) + assert retcode == 0, f"Failed to get loop devices: {stdout}" + assert stdout is not None, "Failed to get loop devices, no stdout message" + loop_devices_info = json.loads(stdout) + for loop_device_info in loop_devices_info["loopdevices"]: + if loop_device_info["name"] == loop_device: + logging.info("Loop device %s info: %s", loop_device, loop_device_info) + break + else: + logging.info("Loop device %s not found", loop_device) + + +async def get_metrics_log(unit: Unit) -> str: + """Retrieve the metrics log from the unit. + + Args: + unit: The unit to retrieve the metrics log from. + + Returns: + The metrics log. + """ + retcode, stdout = await run_in_unit( + unit=unit, + command=f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi", + ) + assert retcode == 0, f"Failed to get metrics log: {stdout}" + assert stdout is not None, "Failed to get metrics log, no stdout message" + logging.info("Metrics log: %s", stdout) + return stdout.strip() + + async def _cancel_workflow_run(unit: Unit, workflow: Workflow): """Cancel the workflow run. @@ -234,7 +189,7 @@ async def _cancel_workflow_run(unit: Unit, workflow: Workflow): run.cancel() -async def _dispatch_workflow( +async def dispatch_workflow( app: Application, branch: Branch, github_repository: Repository, conclusion: str ): """Dispatch a workflow on a branch for the runner to run. @@ -262,7 +217,7 @@ async def _dispatch_workflow( ) -async def _assert_events_after_reconciliation( +async def assert_events_after_reconciliation( app: Application, github_repository: Repository, post_job_status: PostJobStatus ): """Assert that the RunnerStart, RunnerStop and Reconciliation metric is logged. 
@@ -274,7 +229,7 @@
     """
     unit = app.units[0]
 
-    metrics_log = await _get_metrics_log(unit=unit)
+    metrics_log = await get_metrics_log(unit=unit)
     log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines()))
     events = set(map(lambda line: line.get("event"), log_lines))
     assert {
@@ -317,7 +272,7 @@
         assert metric_log.get("idle_runners") >= 0
 
 
-async def _wait_for_runner_to_be_marked_offline(
+async def wait_for_runner_to_be_marked_offline(
    forked_github_repository: Repository, runner_name: str
 ):
     """Wait for the runner to be marked offline or to be non-existent.
@@ -341,223 +296,3 @@
             break
     else:
         assert False, "Timeout while waiting for runner to be marked offline"
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_runner_installed_metric(app: Application, model: Model):
-    """
-    arrange: A charm without runners integrated with grafana-agent using the cos-agent integration.
-    act: Config the charm to contain one runner.
-    assert: The RunnerInstalled metric is logged.
-    """
-
-    await ensure_charm_has_runner(app=app, model=model)
-
-    metrics_log = await _get_metrics_log(app.units[0])
-    log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines()))
-    events = set(map(lambda line: line.get("event"), log_lines))
-    assert "runner_installed" in events, "runner_installed event has not been logged"
-
-    for metric_log in log_lines:
-        if metric_log.get("event") == "runner_installed":
-            assert metric_log.get("flavor") == app.name
-            assert metric_log.get("event") == "runner_installed"
-            assert metric_log.get("duration") >= 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_metrics_after_reconciliation(
-    model: Model,
-    app: Application,
-    forked_github_repository: Repository,
-    forked_github_branch: Branch,
-):
-    """
-    arrange: A properly integrated charm with a runner registered on the fork repo.
-    act: Dispatch a workflow on a branch for the runner to run. After completion, reconcile.
-    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
-        The Reconciliation metric has the post job status set to normal.
-    """
-    await app.set_config({"path": forked_github_repository.full_name})
-    await ensure_charm_has_runner(app=app, model=model)
-
-    # Clear metrics log to make reconciliation event more predictable
-    unit = app.units[0]
-    await _clear_metrics_log(unit)
-    await _dispatch_workflow(
-        app=app,
-        branch=forked_github_branch,
-        github_repository=forked_github_repository,
-        conclusion="success",
-    )
-
-    # Set the number of virtual machines to 0 to speedup reconciliation
-    await app.set_config({"virtual-machines": "0"})
-    await reconcile(app=app, model=model)
-
-    await _assert_events_after_reconciliation(
-        app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
-    )
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_metrics_for_failed_repo_policy(
-    model: Model,
-    app: Application,
-    forked_github_repository: Repository,
-    forked_github_branch: Branch,
-):
-    """
-    arrange: A properly integrated charm with a runner registered on the fork repo.
-    act: Dispatch a test workflow that fails the repo-policy check. After completion, reconcile.
-    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
-        The Reconciliation metric has the post job status set to failure.
- """ - await app.set_config({"path": forked_github_repository.full_name}) - await ensure_charm_has_runner(app=app, model=model) - - # Clear metrics log to make reconciliation event more predictable - unit = app.units[0] - await _clear_metrics_log(unit) - await _dispatch_workflow( - app=app, - branch=forked_github_branch, - github_repository=forked_github_repository, - conclusion="failure", - ) - - # Set the number of virtual machines to 0 to speedup reconciliation - await app.set_config({"virtual-machines": "0"}) - await reconcile(app=app, model=model) - - await _assert_events_after_reconciliation( - app=app, - github_repository=forked_github_repository, - post_job_status=PostJobStatus.REPO_POLICY_CHECK_FAILURE, - ) - - -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_charm_issues_metrics_for_abnormal_termination( - model: Model, - app: Application, - forked_github_repository: Repository, - forked_github_branch: Branch, -): - """ - arrange: A properly integrated charm with a runner registered on the fork repo. - act: Dispatch a test workflow and afterwards kill run.sh. After that, reconcile. - assert: The RunnerStart, RunnerStop and Reconciliation metric is logged. - The Reconciliation metric has the post job status set to Abnormal. - """ - await app.set_config({"path": forked_github_repository.full_name}) - await ensure_charm_has_runner(app=app, model=model) - - unit = app.units[0] - - workflow = forked_github_repository.get_workflow( - id_or_file_name=DISPATCH_CRASH_TEST_WORKFLOW_FILENAME - ) - assert workflow.create_dispatch(forked_github_branch, {"runner": app.name}) - - await _wait_for_workflow_to_start(unit, workflow) - - # Make the runner terminate abnormally by killing run.sh - runner_name = await get_runner_name(unit) - kill_run_sh_cmd = "pkill -9 run.sh" - ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd) - assert ret_code == 0, "Failed to kill run.sh" - - # Cancel workflow and wait that the runner is marked offline - # to avoid errors during reconciliation. - await _cancel_workflow_run(unit, workflow) - await _wait_for_runner_to_be_marked_offline(forked_github_repository, runner_name) - - # Set the number of virtual machines to 0 to speedup reconciliation - await app.set_config({"virtual-machines": "0"}) - await reconcile(app=app, model=model) - - await _assert_events_after_reconciliation( - app=app, - github_repository=forked_github_repository, - post_job_status=PostJobStatus.ABNORMAL, - ) - - -async def test_charm_remounts_shared_fs( - model: Model, - app: Application, - forked_github_repository: Repository, - forked_github_branch: Branch, -): - """ - arrange: A properly integrated charm with a runner registered on the fork repo. - act: Dispatch a test workflow and afterwards unmount the shared fs. After that, reconcile. - assert: The RunnerStart, RunnerStop and Reconciliation metric is logged. 
- """ - await app.set_config({"path": forked_github_repository.full_name}) - await ensure_charm_has_runner(app=app, model=model) - - # Clear metrics log to make reconciliation event more predictable - unit = app.units[0] - runner_name = await get_runner_name(unit) - await _clear_metrics_log(unit) - await _dispatch_workflow( - app=app, - branch=forked_github_branch, - github_repository=forked_github_repository, - conclusion="success", - ) - - # unmount shared fs - await run_in_unit(unit, f"sudo umount /home/ubuntu/runner-fs/{runner_name}") - - # Set the number of virtual machines to 0 to speedup reconciliation - await app.set_config({"virtual-machines": "0"}) - await reconcile(app=app, model=model) - - await _assert_events_after_reconciliation( - app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL - ) - - -@pytest.mark.asyncio -@pytest.mark.abort_on_fail -async def test_charm_retrieves_logs_from_unhealthy_runners( - model: Model, - app: Application, -): - """ - arrange: A properly integrated charm with one runner. - act: Kill the start.sh script, which marks the runner as unhealthy. After that, reconcile. - assert: The logs are pulled from the crashed runner. - """ - await ensure_charm_has_runner(app=app, model=model) - - unit = app.units[0] - runner_name = await get_runner_name(unit) - - kill_start_sh_cmd = "pkill -9 start.sh" - ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd) - assert ret_code == 0, "Failed to kill start.sh" - - # Set the number of virtual machines to 0 to avoid to speedup reconciliation. - await app.set_config({"virtual-machines": "0"}) - await reconcile(app=app, model=model) - - ret_code, stdout = await run_in_unit(unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}") - assert ret_code == 0, "Failed to list crashed runner logs" - assert stdout - assert runner_name in stdout, "Failed to find crashed runner log" - - ret_code, stdout = await run_in_unit( - unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}/{runner_name}" - ) - assert ret_code == 0, "Failed to list crashed runner log" - assert stdout - assert "_diag" in stdout, "Failed to find crashed runner diag log" - assert "syslog" in stdout, "Failed to find crashed runner syslog log" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c23f88c18..f41e8e606 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -24,6 +24,7 @@ ensure_charm_has_runner, reconcile, ) +from tests.status_name import ACTIVE @pytest.fixture(scope="module") @@ -354,3 +355,16 @@ async def app_juju_storage( reconcile_interval=60, ) return application + + +@pytest_asyncio.fixture(scope="module", name="app_with_grafana_agent") +async def app_with_grafana_agent_integrated_fixture( + model: Model, app_no_runner: Application +) -> AsyncIterator[Application]: + """Setup the charm to be integrated with grafana-agent using the cos-agent integration.""" + grafana_agent = await model.deploy("grafana-agent", channel="latest/edge") + await model.relate(f"{app_no_runner.name}:cos-agent", f"{grafana_agent.name}:cos-agent") + await model.wait_for_idle(apps=[app_no_runner.name], status=ACTIVE) + await model.wait_for_idle(apps=[grafana_agent.name]) + + yield app_no_runner diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py new file mode 100644 index 000000000..35b4b2ab3 --- /dev/null +++ b/tests/integration/test_charm_metrics_failure.py @@ -0,0 +1,170 @@ +# Copyright 2024 Canonical 
+# See LICENSE file for licensing details.
+
+"""Integration tests for metrics/logs assuming Github workflow failures or a runner crash."""
+from typing import AsyncIterator
+
+import pytest
+import pytest_asyncio
+from github.Branch import Branch
+from github.Repository import Repository
+from juju.application import Application
+from juju.model import Model
+
+import runner_logs
+from runner_metrics import PostJobStatus
+from tests.integration.charm_metrics_helpers import (
+    _cancel_workflow_run,
+    _wait_for_workflow_to_start,
+    assert_events_after_reconciliation,
+    clear_metrics_log,
+    dispatch_workflow,
+    print_loop_device_info,
+    wait_for_runner_to_be_marked_offline,
+)
+from tests.integration.helpers import (
+    DISPATCH_CRASH_TEST_WORKFLOW_FILENAME,
+    ensure_charm_has_runner,
+    get_runner_name,
+    reconcile,
+    run_in_lxd_instance,
+    run_in_unit,
+)
+
+
+@pytest_asyncio.fixture(scope="function", name="app")
+async def app_fixture(
+    model: Model, app_with_grafana_agent: Application, loop_device: str
+) -> AsyncIterator[Application]:
+    """Setup and teardown the charm after each test.
+
+    Clear the metrics log before each test.
+    """
+    unit = app_with_grafana_agent.units[0]
+    await clear_metrics_log(unit)
+    await print_loop_device_info(unit, loop_device)
+    yield app_with_grafana_agent
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_metrics_for_failed_repo_policy(
+    model: Model,
+    app: Application,
+    forked_github_repository: Repository,
+    forked_github_branch: Branch,
+):
+    """
+    arrange: A properly integrated charm with a runner registered on the fork repo.
+    act: Dispatch a test workflow that fails the repo-policy check. After completion, reconcile.
+    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+        The Reconciliation metric has the post job status set to failure.
+    """
+    await app.set_config({"path": forked_github_repository.full_name})
+    await ensure_charm_has_runner(app=app, model=model)
+
+    # Clear metrics log to make reconciliation event more predictable
+    unit = app.units[0]
+    await clear_metrics_log(unit)
+    await dispatch_workflow(
+        app=app,
+        branch=forked_github_branch,
+        github_repository=forked_github_repository,
+        conclusion="failure",
+    )
+
+    # Set the number of virtual machines to 0 to speedup reconciliation
+    await app.set_config({"virtual-machines": "0"})
+    await reconcile(app=app, model=model)
+
+    await assert_events_after_reconciliation(
+        app=app,
+        github_repository=forked_github_repository,
+        post_job_status=PostJobStatus.REPO_POLICY_CHECK_FAILURE,
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_metrics_for_abnormal_termination(
+    model: Model,
+    app: Application,
+    forked_github_repository: Repository,
+    forked_github_branch: Branch,
+):
+    """
+    arrange: A properly integrated charm with a runner registered on the fork repo.
+    act: Dispatch a test workflow and afterwards kill run.sh. After that, reconcile.
+    assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+        The Reconciliation metric has the post job status set to Abnormal.
+ """ + await app.set_config({"path": forked_github_repository.full_name}) + await ensure_charm_has_runner(app=app, model=model) + + unit = app.units[0] + + workflow = forked_github_repository.get_workflow( + id_or_file_name=DISPATCH_CRASH_TEST_WORKFLOW_FILENAME + ) + assert workflow.create_dispatch(forked_github_branch, {"runner": app.name}) + + await _wait_for_workflow_to_start(unit, workflow) + + # Make the runner terminate abnormally by killing run.sh + runner_name = await get_runner_name(unit) + kill_run_sh_cmd = "pkill -9 run.sh" + ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd) + assert ret_code == 0, "Failed to kill run.sh" + + # Cancel workflow and wait that the runner is marked offline + # to avoid errors during reconciliation. + await _cancel_workflow_run(unit, workflow) + await wait_for_runner_to_be_marked_offline(forked_github_repository, runner_name) + + # Set the number of virtual machines to 0 to speedup reconciliation + await app.set_config({"virtual-machines": "0"}) + await reconcile(app=app, model=model) + + await assert_events_after_reconciliation( + app=app, + github_repository=forked_github_repository, + post_job_status=PostJobStatus.ABNORMAL, + ) + + +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_charm_retrieves_logs_from_unhealthy_runners( + model: Model, + app: Application, +): + """ + arrange: A properly integrated charm with one runner. + act: Kill the start.sh script, which marks the runner as unhealthy. After that, reconcile. + assert: The logs are pulled from the crashed runner. + """ + await ensure_charm_has_runner(app=app, model=model) + + unit = app.units[0] + runner_name = await get_runner_name(unit) + + kill_start_sh_cmd = "pkill -9 start.sh" + ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd) + assert ret_code == 0, "Failed to kill start.sh" + + # Set the number of virtual machines to 0 to avoid to speedup reconciliation. + await app.set_config({"virtual-machines": "0"}) + await reconcile(app=app, model=model) + + ret_code, stdout = await run_in_unit(unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}") + assert ret_code == 0, "Failed to list crashed runner logs" + assert stdout + assert runner_name in stdout, "Failed to find crashed runner log" + + ret_code, stdout = await run_in_unit( + unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}/{runner_name}" + ) + assert ret_code == 0, "Failed to list crashed runner log" + assert stdout + assert "_diag" in stdout, "Failed to find crashed runner diag log" + assert "syslog" in stdout, "Failed to find crashed runner syslog log" diff --git a/tests/integration/test_charm_metrics_success.py b/tests/integration/test_charm_metrics_success.py new file mode 100644 index 000000000..cf433084d --- /dev/null +++ b/tests/integration/test_charm_metrics_success.py @@ -0,0 +1,141 @@ +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +"""Integration tests for metrics/logs assuming no Github workflow failures.""" + +import json +from typing import AsyncIterator + +import pytest +import pytest_asyncio +from github.Branch import Branch +from github.Repository import Repository +from juju.application import Application +from juju.model import Model + +from runner_metrics import PostJobStatus +from tests.integration.charm_metrics_helpers import ( + assert_events_after_reconciliation, + clear_metrics_log, + dispatch_workflow, + get_metrics_log, + print_loop_device_info, +) +from tests.integration.helpers import ( + ensure_charm_has_runner, + get_runner_name, + reconcile, + run_in_unit, +) + + +@pytest_asyncio.fixture(scope="function", name="app") +async def app_fixture( + model: Model, app_with_grafana_agent: Application, loop_device: str +) -> AsyncIterator[Application]: + """Setup and teardown the charm after each test. + + Clear the metrics log before each test. + """ + unit = app_with_grafana_agent.units[0] + await clear_metrics_log(unit) + await print_loop_device_info(unit, loop_device) + yield app_with_grafana_agent + + +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_charm_issues_runner_installed_metric(app: Application, model: Model): + """ + arrange: A charm without runners integrated with grafana-agent using the cos-agent integration. + act: Config the charm to contain one runner. + assert: The RunnerInstalled metric is logged. + """ + + await ensure_charm_has_runner(app=app, model=model) + + metrics_log = await get_metrics_log(app.units[0]) + log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines())) + events = set(map(lambda line: line.get("event"), log_lines)) + assert "runner_installed" in events, "runner_installed event has not been logged" + + for metric_log in log_lines: + if metric_log.get("event") == "runner_installed": + assert metric_log.get("flavor") == app.name + assert metric_log.get("event") == "runner_installed" + assert metric_log.get("duration") >= 0 + + +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_charm_issues_metrics_after_reconciliation( + model: Model, + app: Application, + forked_github_repository: Repository, + forked_github_branch: Branch, +): + """ + arrange: A properly integrated charm with a runner registered on the fork repo. + act: Dispatch a workflow on a branch for the runner to run. After completion, reconcile. + assert: The RunnerStart, RunnerStop and Reconciliation metric is logged. + The Reconciliation metric has the post job status set to normal. 
+ """ + await app.set_config({"path": forked_github_repository.full_name}) + await ensure_charm_has_runner(app=app, model=model) + + # Clear metrics log to make reconciliation event more predictable + unit = app.units[0] + await clear_metrics_log(unit) + await dispatch_workflow( + app=app, + branch=forked_github_branch, + github_repository=forked_github_repository, + conclusion="success", + ) + + # Set the number of virtual machines to 0 to speedup reconciliation + await app.set_config({"virtual-machines": "0"}) + await reconcile(app=app, model=model) + + await assert_events_after_reconciliation( + app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL + ) + + +@pytest.mark.asyncio +@pytest.mark.abort_on_fail +async def test_charm_remounts_shared_fs( + model: Model, + app: Application, + forked_github_repository: Repository, + forked_github_branch: Branch, +): + """ + arrange: A properly integrated charm with a runner registered on the fork repo. + act: Dispatch a test workflow and afterwards unmount the shared fs. After that, reconcile. + assert: The RunnerStart, RunnerStop and Reconciliation metric is logged. + """ + await app.set_config({"path": forked_github_repository.full_name}) + await ensure_charm_has_runner(app=app, model=model) + + # Clear metrics log to make reconciliation event more predictable + unit = app.units[0] + runner_name = await get_runner_name(unit) + await clear_metrics_log(unit) + await dispatch_workflow( + app=app, + branch=forked_github_branch, + github_repository=forked_github_repository, + conclusion="success", + ) + + # unmount shared fs + await run_in_unit(unit, f"sudo umount /home/ubuntu/runner-fs/{runner_name}") + + # Set the number of virtual machines to 0 to speedup reconciliation + await app.set_config({"virtual-machines": "0"}) + await reconcile(app=app, model=model) + + await assert_events_after_reconciliation( + app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL + )