diff --git a/.github/workflows/e2e_test.yaml b/.github/workflows/e2e_test.yaml
index 3f549c8ae..a52b62ca9 100644
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@@ -34,7 +34,7 @@ jobs:
EOF
- name: Cache github-runner Charm
- uses: actions/cache@v3
+ uses: actions/cache@v4
id: cache-charm
with:
path: github-runner_ubuntu-22.04-amd64.charm
diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
index c058dd3bf..b6bdea75a 100644
--- a/.github/workflows/integration_test.yaml
+++ b/.github/workflows/integration_test.yaml
@@ -13,7 +13,7 @@ jobs:
pre-run-script: scripts/pre-integration-test.sh
provider: lxd
test-tox-env: integration-juju2.9
- modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
+ modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
integration-tests-juju3:
name: Integration test with juju 3.1
uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
@@ -23,4 +23,4 @@ jobs:
pre-run-script: scripts/pre-integration-test.sh
provider: lxd
test-tox-env: integration-juju3.1
- modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
+ modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]'
diff --git a/.github/workflows/workflow_dispatch_ssh_debug.yaml b/.github/workflows/workflow_dispatch_ssh_debug.yaml
index a44458bc4..e4f388c14 100644
--- a/.github/workflows/workflow_dispatch_ssh_debug.yaml
+++ b/.github/workflows/workflow_dispatch_ssh_debug.yaml
@@ -14,4 +14,4 @@ jobs:
steps:
- name: Setup tmate session
uses: canonical/action-tmate@chore/env_var_change
- timeout-minutes: 1
+ timeout-minutes: 5
diff --git a/scripts/build-image.sh b/scripts/build-image.sh
index 315bcd09c..5fb616feb 100644
--- a/scripts/build-image.sh
+++ b/scripts/build-image.sh
@@ -87,6 +87,8 @@ retry '/snap/bin/lxc exec builder -- /usr/bin/nslookup github.com' 'Wait for net
/snap/bin/lxc exec builder -- /usr/bin/apt-get update
/snap/bin/lxc exec builder --env DEBIAN_FRONTEND=noninteractive -- /usr/bin/apt-get upgrade -yq
/snap/bin/lxc exec builder --env DEBIAN_FRONTEND=noninteractive -- /usr/bin/apt-get install linux-generic-hwe-22.04 -yq
+# Remove older kernel versions now that the HWE kernel is installed.
+/snap/bin/lxc exec builder --env DEBIAN_FRONTEND=noninteractive -- /usr/bin/apt-get autoremove --purge -yq
/snap/bin/lxc restart builder
retry '/snap/bin/lxc exec builder -- /usr/bin/who' 'Wait for lxd agent to be ready' 30
@@ -107,6 +109,10 @@ fi
/snap/bin/lxc exec builder -- /usr/sbin/usermod -aG docker ubuntu
/snap/bin/lxc exec builder -- /usr/sbin/iptables -I DOCKER-USER -j ACCEPT
+# Reduce image size
+/snap/bin/lxc exec builder -- /usr/bin/npm cache clean --force
+/snap/bin/lxc exec builder -- /usr/bin/apt-get clean
+
# Download and verify checksum of yq
if [[ $(uname -m) == 'aarch64' ]]; then
YQ_ARCH="arm64"
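
A rough way to see what the new cleanup steps reclaim (not part of the patch) is to inspect the builder container right after they run; this sketch assumes the `builder` container from this script is still up, and `/root/.npm` is only an assumed location for root's npm cache:

```bash
# Rough check of the space reclaimed by the cleanup steps above (run on the LXD host).
# Assumes the "builder" container created by this script is still running.
/snap/bin/lxc exec builder -- /usr/bin/df -h /                # free space inside the container
/snap/bin/lxc exec builder -- /usr/bin/du -sh /var/cache/apt  # should be near-empty after apt-get clean
/snap/bin/lxc exec builder -- /usr/bin/du -sh /root/.npm      # assumed npm cache path; near-empty after npm cache clean
```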
diff --git a/src-docs/event_timer.py.md b/src-docs/event_timer.py.md
index aa0b2290b..24b5c4422 100644
--- a/src-docs/event_timer.py.md
+++ b/src-docs/event_timer.py.md
@@ -48,7 +48,7 @@ Construct the timer manager.
---
-
+
### function `disable_event_timer`
@@ -96,6 +96,7 @@ The timeout is the number of seconds before an event is timed out. If not set or
- `event_name`: Name of the juju event to schedule.
- `interval`: Number of minutes between emitting each event.
+ - `timeout`: Timeout in minutes for handling each event.
diff --git a/src/charm.py b/src/charm.py
index 87e9fb828..a058a85ac 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -163,7 +163,6 @@ def __init__(self, *args, **kargs) -> None:
path=self.config["path"], # for detecting changes
token=self.config["token"], # for detecting changes
runner_bin_url=None,
- runner_image_url=None,
)
self.proxies: ProxySetting = {}
@@ -503,7 +502,9 @@ def _on_config_changed(self, _event: ConfigChangedEvent) -> None:
self._refresh_firewall()
try:
self._event_timer.ensure_event_timer(
- "reconcile-runners", self.config["reconcile-interval"]
+ event_name="reconcile-runners",
+ interval=int(self.config["reconcile-interval"]),
+ timeout=int(self.config["reconcile-interval"]) - 1,
)
except TimerEnableError as ex:
logger.exception("Failed to start the event timer")
diff --git a/src/event_timer.py b/src/event_timer.py
index 826a80bde..67b87e616 100644
--- a/src/event_timer.py
+++ b/src/event_timer.py
@@ -69,15 +69,21 @@ def ensure_event_timer(self, event_name: str, interval: int, timeout: Optional[i
Args:
event_name: Name of the juju event to schedule.
interval: Number of minutes between emitting each event.
+            timeout: Timeout in minutes for handling each event.
Raises:
TimerEnableError: Timer cannot be started. Events will be not emitted.
"""
+ if timeout is not None:
+ timeout_in_secs = timeout * 60
+ else:
+ timeout_in_secs = interval * 30
+
context: EventConfig = {
"event": event_name,
"interval": interval,
"random_delay": interval // 4,
- "timeout": timeout or (interval * 30),
+ "timeout": timeout_in_secs,
"unit": self.unit_name,
}
self._render_event_template("service", event_name, context)
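
To make the unit handling above easier to follow, here is a minimal sketch of the conversion `ensure_event_timer` now performs, using the call from `charm.py` with an illustrative `reconcile-interval` of 10 minutes (the rendered systemd template consumes the timeout in seconds):

```python
interval = 10            # minutes, e.g. the charm's reconcile-interval config
timeout = interval - 1   # minutes, as passed from charm.py above; may also be None

# Explicit timeout: convert minutes to seconds.
# No timeout given: fall back to interval * 30 seconds, i.e. half the interval.
if timeout is not None:
    timeout_in_secs = timeout * 60
else:
    timeout_in_secs = interval * 30

assert timeout_in_secs == 540  # 9 minutes, comfortably below the 10-minute interval
```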
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 000000000..188515554
--- /dev/null
+++ b/tests/integration/__init__.py
@@ -0,0 +1,2 @@
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
diff --git a/tests/integration/test_charm_metrics.py b/tests/integration/charm_metrics_helpers.py
similarity index 50%
rename from tests/integration/test_charm_metrics.py
rename to tests/integration/charm_metrics_helpers.py
index dd25d325d..5b61ca58c 100644
--- a/tests/integration/test_charm_metrics.py
+++ b/tests/integration/charm_metrics_helpers.py
@@ -1,39 +1,31 @@
-# Copyright 2024 Canonical Ltd.
+# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
-"""Integration tests for metrics/logs."""
+"""Utilities for charm metrics integration tests."""
+
+
import json
import logging
from datetime import datetime, timezone
from time import sleep
-from typing import AsyncIterator
-import pytest
-import pytest_asyncio
import requests
from github.Branch import Branch
from github.Repository import Repository
from github.Workflow import Workflow
from juju.application import Application
-from juju.model import Model
from juju.unit import Unit
-import runner_logs
from github_type import JobConclusion
from metrics import METRICS_LOG_PATH
from runner_metrics import PostJobStatus
from tests.integration.helpers import (
- DISPATCH_CRASH_TEST_WORKFLOW_FILENAME,
DISPATCH_FAILURE_TEST_WORKFLOW_FILENAME,
DISPATCH_TEST_WORKFLOW_FILENAME,
- ensure_charm_has_runner,
get_runner_name,
get_runner_names,
- reconcile,
- run_in_lxd_instance,
run_in_unit,
)
-from tests.status_name import ACTIVE
TEST_WORKFLOW_NAMES = [
"Workflow Dispatch Tests",
@@ -43,97 +35,6 @@
JOB_LOG_START_MSG_TEMPLATE = "Job is about to start running on the runner: {runner_name}"
-@pytest_asyncio.fixture(scope="module", name="app_integrated")
-async def app_integrated_fixture(
- model: Model, app_no_runner: Application
-) -> AsyncIterator[Application]:
- """Setup the charm to be integrated with grafana-agent using the cos-agent integration."""
- await _integrate_apps(app_no_runner, model)
-
- yield app_no_runner
-
-
-async def _clear_metrics_log(unit: Unit) -> None:
- """Clear the metrics log on the unit.
-
- Args:
- unit: The unit to clear the metrics log on.
- """
- retcode, _ = await run_in_unit(
- unit=unit,
- command=f"if [ -f {METRICS_LOG_PATH} ]; then rm {METRICS_LOG_PATH}; fi",
- )
- assert retcode == 0, "Failed to clear metrics log"
-
-
-async def _print_loop_device_info(unit: Unit, loop_device: str) -> None:
- """Print loop device info on the unit.
-
- Args:
- unit: The unit to print the loop device info on.
- loop_device: The loop device to print the info for.
- """
- retcode, stdout = await run_in_unit(
- unit=unit,
- command="sudo losetup -lJ",
- )
- assert retcode == 0, f"Failed to get loop devices: {stdout}"
- assert stdout is not None, "Failed to get loop devices, no stdout message"
- loop_devices_info = json.loads(stdout)
- for loop_device_info in loop_devices_info["loopdevices"]:
- if loop_device_info["name"] == loop_device:
- logging.info("Loop device %s info: %s", loop_device, loop_device_info)
- break
- else:
- logging.info("Loop device %s not found", loop_device)
-
-
-@pytest_asyncio.fixture(scope="function", name="app")
-async def app_fixture(
- model: Model, app_integrated: Application, loop_device: str
-) -> AsyncIterator[Application]:
- """Setup and teardown the charm after each test.
-
- Clear the metrics log before each test.
- """
- unit = app_integrated.units[0]
- await _clear_metrics_log(unit)
- await _print_loop_device_info(unit, loop_device)
- yield app_integrated
-
-
-async def _get_metrics_log(unit: Unit) -> str:
- """Retrieve the metrics log from the unit.
-
- Args:
- unit: The unit to retrieve the metrics log from.
-
- Returns:
- The metrics log.
- """
- retcode, stdout = await run_in_unit(
- unit=unit,
- command=f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi",
- )
- assert retcode == 0, f"Failed to get metrics log: {stdout}"
- assert stdout is not None, "Failed to get metrics log, no stdout message"
- logging.info("Metrics log: %s", stdout)
- return stdout.strip()
-
-
-async def _integrate_apps(app: Application, model: Model):
- """Integrate the charm with grafana-agent using the cos-agent integration.
-
- Args:
- app: The charm to integrate.
- model: The model to deploy the grafana-agent to.
- """
- grafana_agent = await model.deploy("grafana-agent", channel="latest/edge")
- await model.relate(f"{app.name}:cos-agent", f"{grafana_agent.name}:cos-agent")
- await model.wait_for_idle(apps=[app.name], status=ACTIVE)
- await model.wait_for_idle(apps=[grafana_agent.name])
-
-
async def _wait_until_runner_is_used_up(runner_name: str, unit: Unit):
"""Wait until the runner is used up.
@@ -215,6 +116,60 @@ async def _wait_for_workflow_to_start(unit: Unit, workflow: Workflow):
assert False, "Timeout while waiting for the workflow to start"
+async def clear_metrics_log(unit: Unit) -> None:
+ """Clear the metrics log on the unit.
+
+ Args:
+ unit: The unit to clear the metrics log on.
+ """
+ retcode, _ = await run_in_unit(
+ unit=unit,
+ command=f"if [ -f {METRICS_LOG_PATH} ]; then rm {METRICS_LOG_PATH}; fi",
+ )
+ assert retcode == 0, "Failed to clear metrics log"
+
+
+async def print_loop_device_info(unit: Unit, loop_device: str) -> None:
+ """Print loop device info on the unit.
+
+ Args:
+ unit: The unit to print the loop device info on.
+ loop_device: The loop device to print the info for.
+ """
+ retcode, stdout = await run_in_unit(
+ unit=unit,
+ command="sudo losetup -lJ",
+ )
+ assert retcode == 0, f"Failed to get loop devices: {stdout}"
+ assert stdout is not None, "Failed to get loop devices, no stdout message"
+ loop_devices_info = json.loads(stdout)
+ for loop_device_info in loop_devices_info["loopdevices"]:
+ if loop_device_info["name"] == loop_device:
+ logging.info("Loop device %s info: %s", loop_device, loop_device_info)
+ break
+ else:
+ logging.info("Loop device %s not found", loop_device)
+
+
+async def get_metrics_log(unit: Unit) -> str:
+ """Retrieve the metrics log from the unit.
+
+ Args:
+ unit: The unit to retrieve the metrics log from.
+
+ Returns:
+ The metrics log.
+ """
+ retcode, stdout = await run_in_unit(
+ unit=unit,
+ command=f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi",
+ )
+ assert retcode == 0, f"Failed to get metrics log: {stdout}"
+ assert stdout is not None, "Failed to get metrics log, no stdout message"
+ logging.info("Metrics log: %s", stdout)
+ return stdout.strip()
+
+
async def _cancel_workflow_run(unit: Unit, workflow: Workflow):
"""Cancel the workflow run.
@@ -234,7 +189,7 @@ async def _cancel_workflow_run(unit: Unit, workflow: Workflow):
run.cancel()
-async def _dispatch_workflow(
+async def dispatch_workflow(
app: Application, branch: Branch, github_repository: Repository, conclusion: str
):
"""Dispatch a workflow on a branch for the runner to run.
@@ -262,7 +217,7 @@ async def _dispatch_workflow(
)
-async def _assert_events_after_reconciliation(
+async def assert_events_after_reconciliation(
app: Application, github_repository: Repository, post_job_status: PostJobStatus
):
"""Assert that the RunnerStart, RunnerStop and Reconciliation metric is logged.
@@ -274,7 +229,7 @@ async def _assert_events_after_reconciliation(
"""
unit = app.units[0]
- metrics_log = await _get_metrics_log(unit=unit)
+ metrics_log = await get_metrics_log(unit=unit)
log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines()))
events = set(map(lambda line: line.get("event"), log_lines))
assert {
@@ -317,7 +272,7 @@ async def _assert_events_after_reconciliation(
assert metric_log.get("idle_runners") >= 0
-async def _wait_for_runner_to_be_marked_offline(
+async def wait_for_runner_to_be_marked_offline(
forked_github_repository: Repository, runner_name: str
):
"""Wait for the runner to be marked offline or to be non-existent.
@@ -341,223 +296,3 @@ async def _wait_for_runner_to_be_marked_offline(
break
else:
assert False, "Timeout while waiting for runner to be marked offline"
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_runner_installed_metric(app: Application, model: Model):
- """
- arrange: A charm without runners integrated with grafana-agent using the cos-agent integration.
- act: Config the charm to contain one runner.
- assert: The RunnerInstalled metric is logged.
- """
-
- await ensure_charm_has_runner(app=app, model=model)
-
- metrics_log = await _get_metrics_log(app.units[0])
- log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines()))
- events = set(map(lambda line: line.get("event"), log_lines))
- assert "runner_installed" in events, "runner_installed event has not been logged"
-
- for metric_log in log_lines:
- if metric_log.get("event") == "runner_installed":
- assert metric_log.get("flavor") == app.name
- assert metric_log.get("event") == "runner_installed"
- assert metric_log.get("duration") >= 0
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_metrics_after_reconciliation(
- model: Model,
- app: Application,
- forked_github_repository: Repository,
- forked_github_branch: Branch,
-):
- """
- arrange: A properly integrated charm with a runner registered on the fork repo.
- act: Dispatch a workflow on a branch for the runner to run. After completion, reconcile.
- assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
- The Reconciliation metric has the post job status set to normal.
- """
- await app.set_config({"path": forked_github_repository.full_name})
- await ensure_charm_has_runner(app=app, model=model)
-
- # Clear metrics log to make reconciliation event more predictable
- unit = app.units[0]
- await _clear_metrics_log(unit)
- await _dispatch_workflow(
- app=app,
- branch=forked_github_branch,
- github_repository=forked_github_repository,
- conclusion="success",
- )
-
- # Set the number of virtual machines to 0 to speedup reconciliation
- await app.set_config({"virtual-machines": "0"})
- await reconcile(app=app, model=model)
-
- await _assert_events_after_reconciliation(
- app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
- )
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_metrics_for_failed_repo_policy(
- model: Model,
- app: Application,
- forked_github_repository: Repository,
- forked_github_branch: Branch,
-):
- """
- arrange: A properly integrated charm with a runner registered on the fork repo.
- act: Dispatch a test workflow that fails the repo-policy check. After completion, reconcile.
- assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
- The Reconciliation metric has the post job status set to failure.
- """
- await app.set_config({"path": forked_github_repository.full_name})
- await ensure_charm_has_runner(app=app, model=model)
-
- # Clear metrics log to make reconciliation event more predictable
- unit = app.units[0]
- await _clear_metrics_log(unit)
- await _dispatch_workflow(
- app=app,
- branch=forked_github_branch,
- github_repository=forked_github_repository,
- conclusion="failure",
- )
-
- # Set the number of virtual machines to 0 to speedup reconciliation
- await app.set_config({"virtual-machines": "0"})
- await reconcile(app=app, model=model)
-
- await _assert_events_after_reconciliation(
- app=app,
- github_repository=forked_github_repository,
- post_job_status=PostJobStatus.REPO_POLICY_CHECK_FAILURE,
- )
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_issues_metrics_for_abnormal_termination(
- model: Model,
- app: Application,
- forked_github_repository: Repository,
- forked_github_branch: Branch,
-):
- """
- arrange: A properly integrated charm with a runner registered on the fork repo.
- act: Dispatch a test workflow and afterwards kill run.sh. After that, reconcile.
- assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
- The Reconciliation metric has the post job status set to Abnormal.
- """
- await app.set_config({"path": forked_github_repository.full_name})
- await ensure_charm_has_runner(app=app, model=model)
-
- unit = app.units[0]
-
- workflow = forked_github_repository.get_workflow(
- id_or_file_name=DISPATCH_CRASH_TEST_WORKFLOW_FILENAME
- )
- assert workflow.create_dispatch(forked_github_branch, {"runner": app.name})
-
- await _wait_for_workflow_to_start(unit, workflow)
-
- # Make the runner terminate abnormally by killing run.sh
- runner_name = await get_runner_name(unit)
- kill_run_sh_cmd = "pkill -9 run.sh"
- ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd)
- assert ret_code == 0, "Failed to kill run.sh"
-
- # Cancel workflow and wait that the runner is marked offline
- # to avoid errors during reconciliation.
- await _cancel_workflow_run(unit, workflow)
- await _wait_for_runner_to_be_marked_offline(forked_github_repository, runner_name)
-
- # Set the number of virtual machines to 0 to speedup reconciliation
- await app.set_config({"virtual-machines": "0"})
- await reconcile(app=app, model=model)
-
- await _assert_events_after_reconciliation(
- app=app,
- github_repository=forked_github_repository,
- post_job_status=PostJobStatus.ABNORMAL,
- )
-
-
-async def test_charm_remounts_shared_fs(
- model: Model,
- app: Application,
- forked_github_repository: Repository,
- forked_github_branch: Branch,
-):
- """
- arrange: A properly integrated charm with a runner registered on the fork repo.
- act: Dispatch a test workflow and afterwards unmount the shared fs. After that, reconcile.
- assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
- """
- await app.set_config({"path": forked_github_repository.full_name})
- await ensure_charm_has_runner(app=app, model=model)
-
- # Clear metrics log to make reconciliation event more predictable
- unit = app.units[0]
- runner_name = await get_runner_name(unit)
- await _clear_metrics_log(unit)
- await _dispatch_workflow(
- app=app,
- branch=forked_github_branch,
- github_repository=forked_github_repository,
- conclusion="success",
- )
-
- # unmount shared fs
- await run_in_unit(unit, f"sudo umount /home/ubuntu/runner-fs/{runner_name}")
-
- # Set the number of virtual machines to 0 to speedup reconciliation
- await app.set_config({"virtual-machines": "0"})
- await reconcile(app=app, model=model)
-
- await _assert_events_after_reconciliation(
- app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
- )
-
-
-@pytest.mark.asyncio
-@pytest.mark.abort_on_fail
-async def test_charm_retrieves_logs_from_unhealthy_runners(
- model: Model,
- app: Application,
-):
- """
- arrange: A properly integrated charm with one runner.
- act: Kill the start.sh script, which marks the runner as unhealthy. After that, reconcile.
- assert: The logs are pulled from the crashed runner.
- """
- await ensure_charm_has_runner(app=app, model=model)
-
- unit = app.units[0]
- runner_name = await get_runner_name(unit)
-
- kill_start_sh_cmd = "pkill -9 start.sh"
- ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd)
- assert ret_code == 0, "Failed to kill start.sh"
-
- # Set the number of virtual machines to 0 to avoid to speedup reconciliation.
- await app.set_config({"virtual-machines": "0"})
- await reconcile(app=app, model=model)
-
- ret_code, stdout = await run_in_unit(unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}")
- assert ret_code == 0, "Failed to list crashed runner logs"
- assert stdout
- assert runner_name in stdout, "Failed to find crashed runner log"
-
- ret_code, stdout = await run_in_unit(
- unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}/{runner_name}"
- )
- assert ret_code == 0, "Failed to list crashed runner log"
- assert stdout
- assert "_diag" in stdout, "Failed to find crashed runner diag log"
- assert "syslog" in stdout, "Failed to find crashed runner syslog log"
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index b2b257a62..fd173d57c 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -434,3 +434,16 @@ def get_branch():
yield get_branch()
branch_ref.delete()
+
+
+@pytest_asyncio.fixture(scope="module", name="app_with_grafana_agent")
+async def app_with_grafana_agent_integrated_fixture(
+ model: Model, app_no_runner: Application
+) -> AsyncIterator[Application]:
+ """Setup the charm to be integrated with grafana-agent using the cos-agent integration."""
+ grafana_agent = await model.deploy("grafana-agent", channel="latest/edge")
+ await model.relate(f"{app_no_runner.name}:cos-agent", f"{grafana_agent.name}:cos-agent")
+ await model.wait_for_idle(apps=[app_no_runner.name], status=ACTIVE)
+ await model.wait_for_idle(apps=[grafana_agent.name])
+
+ yield app_no_runner
diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py
new file mode 100644
index 000000000..35b4b2ab3
--- /dev/null
+++ b/tests/integration/test_charm_metrics_failure.py
@@ -0,0 +1,170 @@
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+"""Integration tests for metrics/logs assuming Github workflow failures or a runner crash."""
+from typing import AsyncIterator
+
+import pytest
+import pytest_asyncio
+from github.Branch import Branch
+from github.Repository import Repository
+from juju.application import Application
+from juju.model import Model
+
+import runner_logs
+from runner_metrics import PostJobStatus
+from tests.integration.charm_metrics_helpers import (
+ _cancel_workflow_run,
+ _wait_for_workflow_to_start,
+ assert_events_after_reconciliation,
+ clear_metrics_log,
+ dispatch_workflow,
+ print_loop_device_info,
+ wait_for_runner_to_be_marked_offline,
+)
+from tests.integration.helpers import (
+ DISPATCH_CRASH_TEST_WORKFLOW_FILENAME,
+ ensure_charm_has_runner,
+ get_runner_name,
+ reconcile,
+ run_in_lxd_instance,
+ run_in_unit,
+)
+
+
+@pytest_asyncio.fixture(scope="function", name="app")
+async def app_fixture(
+ model: Model, app_with_grafana_agent: Application, loop_device: str
+) -> AsyncIterator[Application]:
+ """Setup and teardown the charm after each test.
+
+ Clear the metrics log before each test.
+ """
+ unit = app_with_grafana_agent.units[0]
+ await clear_metrics_log(unit)
+ await print_loop_device_info(unit, loop_device)
+ yield app_with_grafana_agent
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_metrics_for_failed_repo_policy(
+ model: Model,
+ app: Application,
+ forked_github_repository: Repository,
+ forked_github_branch: Branch,
+):
+ """
+ arrange: A properly integrated charm with a runner registered on the fork repo.
+ act: Dispatch a test workflow that fails the repo-policy check. After completion, reconcile.
+ assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+ The Reconciliation metric has the post job status set to failure.
+ """
+ await app.set_config({"path": forked_github_repository.full_name})
+ await ensure_charm_has_runner(app=app, model=model)
+
+ # Clear metrics log to make reconciliation event more predictable
+ unit = app.units[0]
+ await clear_metrics_log(unit)
+ await dispatch_workflow(
+ app=app,
+ branch=forked_github_branch,
+ github_repository=forked_github_repository,
+ conclusion="failure",
+ )
+
+    # Set the number of virtual machines to 0 to speed up reconciliation
+ await app.set_config({"virtual-machines": "0"})
+ await reconcile(app=app, model=model)
+
+ await assert_events_after_reconciliation(
+ app=app,
+ github_repository=forked_github_repository,
+ post_job_status=PostJobStatus.REPO_POLICY_CHECK_FAILURE,
+ )
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_metrics_for_abnormal_termination(
+ model: Model,
+ app: Application,
+ forked_github_repository: Repository,
+ forked_github_branch: Branch,
+):
+ """
+ arrange: A properly integrated charm with a runner registered on the fork repo.
+ act: Dispatch a test workflow and afterwards kill run.sh. After that, reconcile.
+ assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+ The Reconciliation metric has the post job status set to Abnormal.
+ """
+ await app.set_config({"path": forked_github_repository.full_name})
+ await ensure_charm_has_runner(app=app, model=model)
+
+ unit = app.units[0]
+
+ workflow = forked_github_repository.get_workflow(
+ id_or_file_name=DISPATCH_CRASH_TEST_WORKFLOW_FILENAME
+ )
+ assert workflow.create_dispatch(forked_github_branch, {"runner": app.name})
+
+ await _wait_for_workflow_to_start(unit, workflow)
+
+ # Make the runner terminate abnormally by killing run.sh
+ runner_name = await get_runner_name(unit)
+ kill_run_sh_cmd = "pkill -9 run.sh"
+ ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd)
+ assert ret_code == 0, "Failed to kill run.sh"
+
+ # Cancel workflow and wait that the runner is marked offline
+ # to avoid errors during reconciliation.
+ await _cancel_workflow_run(unit, workflow)
+ await wait_for_runner_to_be_marked_offline(forked_github_repository, runner_name)
+
+    # Set the number of virtual machines to 0 to speed up reconciliation
+ await app.set_config({"virtual-machines": "0"})
+ await reconcile(app=app, model=model)
+
+ await assert_events_after_reconciliation(
+ app=app,
+ github_repository=forked_github_repository,
+ post_job_status=PostJobStatus.ABNORMAL,
+ )
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_retrieves_logs_from_unhealthy_runners(
+ model: Model,
+ app: Application,
+):
+ """
+ arrange: A properly integrated charm with one runner.
+ act: Kill the start.sh script, which marks the runner as unhealthy. After that, reconcile.
+ assert: The logs are pulled from the crashed runner.
+ """
+ await ensure_charm_has_runner(app=app, model=model)
+
+ unit = app.units[0]
+ runner_name = await get_runner_name(unit)
+
+ kill_start_sh_cmd = "pkill -9 start.sh"
+ ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd)
+ assert ret_code == 0, "Failed to kill start.sh"
+
+    # Set the number of virtual machines to 0 to speed up reconciliation.
+ await app.set_config({"virtual-machines": "0"})
+ await reconcile(app=app, model=model)
+
+ ret_code, stdout = await run_in_unit(unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}")
+ assert ret_code == 0, "Failed to list crashed runner logs"
+ assert stdout
+ assert runner_name in stdout, "Failed to find crashed runner log"
+
+ ret_code, stdout = await run_in_unit(
+ unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}/{runner_name}"
+ )
+ assert ret_code == 0, "Failed to list crashed runner log"
+ assert stdout
+ assert "_diag" in stdout, "Failed to find crashed runner diag log"
+ assert "syslog" in stdout, "Failed to find crashed runner syslog log"
diff --git a/tests/integration/test_charm_metrics_success.py b/tests/integration/test_charm_metrics_success.py
new file mode 100644
index 000000000..cf433084d
--- /dev/null
+++ b/tests/integration/test_charm_metrics_success.py
@@ -0,0 +1,141 @@
+# Copyright 2024 Canonical Ltd.
+# See LICENSE file for licensing details.
+
+"""Integration tests for metrics/logs assuming no Github workflow failures."""
+
+import json
+from typing import AsyncIterator
+
+import pytest
+import pytest_asyncio
+from github.Branch import Branch
+from github.Repository import Repository
+from juju.application import Application
+from juju.model import Model
+
+from runner_metrics import PostJobStatus
+from tests.integration.charm_metrics_helpers import (
+ assert_events_after_reconciliation,
+ clear_metrics_log,
+ dispatch_workflow,
+ get_metrics_log,
+ print_loop_device_info,
+)
+from tests.integration.helpers import (
+ ensure_charm_has_runner,
+ get_runner_name,
+ reconcile,
+ run_in_unit,
+)
+
+
+@pytest_asyncio.fixture(scope="function", name="app")
+async def app_fixture(
+ model: Model, app_with_grafana_agent: Application, loop_device: str
+) -> AsyncIterator[Application]:
+ """Setup and teardown the charm after each test.
+
+ Clear the metrics log before each test.
+ """
+ unit = app_with_grafana_agent.units[0]
+ await clear_metrics_log(unit)
+ await print_loop_device_info(unit, loop_device)
+ yield app_with_grafana_agent
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_runner_installed_metric(app: Application, model: Model):
+ """
+ arrange: A charm without runners integrated with grafana-agent using the cos-agent integration.
+    act: Configure the charm to contain one runner.
+ assert: The RunnerInstalled metric is logged.
+ """
+
+ await ensure_charm_has_runner(app=app, model=model)
+
+ metrics_log = await get_metrics_log(app.units[0])
+ log_lines = list(map(lambda line: json.loads(line), metrics_log.splitlines()))
+ events = set(map(lambda line: line.get("event"), log_lines))
+ assert "runner_installed" in events, "runner_installed event has not been logged"
+
+ for metric_log in log_lines:
+ if metric_log.get("event") == "runner_installed":
+ assert metric_log.get("flavor") == app.name
+ assert metric_log.get("event") == "runner_installed"
+ assert metric_log.get("duration") >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_issues_metrics_after_reconciliation(
+ model: Model,
+ app: Application,
+ forked_github_repository: Repository,
+ forked_github_branch: Branch,
+):
+ """
+ arrange: A properly integrated charm with a runner registered on the fork repo.
+ act: Dispatch a workflow on a branch for the runner to run. After completion, reconcile.
+ assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+ The Reconciliation metric has the post job status set to normal.
+ """
+ await app.set_config({"path": forked_github_repository.full_name})
+ await ensure_charm_has_runner(app=app, model=model)
+
+ # Clear metrics log to make reconciliation event more predictable
+ unit = app.units[0]
+ await clear_metrics_log(unit)
+ await dispatch_workflow(
+ app=app,
+ branch=forked_github_branch,
+ github_repository=forked_github_repository,
+ conclusion="success",
+ )
+
+    # Set the number of virtual machines to 0 to speed up reconciliation
+ await app.set_config({"virtual-machines": "0"})
+ await reconcile(app=app, model=model)
+
+ await assert_events_after_reconciliation(
+ app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
+ )
+
+
+@pytest.mark.asyncio
+@pytest.mark.abort_on_fail
+async def test_charm_remounts_shared_fs(
+ model: Model,
+ app: Application,
+ forked_github_repository: Repository,
+ forked_github_branch: Branch,
+):
+ """
+ arrange: A properly integrated charm with a runner registered on the fork repo.
+ act: Dispatch a test workflow and afterwards unmount the shared fs. After that, reconcile.
+ assert: The RunnerStart, RunnerStop and Reconciliation metric is logged.
+ """
+ await app.set_config({"path": forked_github_repository.full_name})
+ await ensure_charm_has_runner(app=app, model=model)
+
+ # Clear metrics log to make reconciliation event more predictable
+ unit = app.units[0]
+ runner_name = await get_runner_name(unit)
+ await clear_metrics_log(unit)
+ await dispatch_workflow(
+ app=app,
+ branch=forked_github_branch,
+ github_repository=forked_github_repository,
+ conclusion="success",
+ )
+
+ # unmount shared fs
+ await run_in_unit(unit, f"sudo umount /home/ubuntu/runner-fs/{runner_name}")
+
+    # Set the number of virtual machines to 0 to speed up reconciliation
+ await app.set_config({"virtual-machines": "0"})
+ await reconcile(app=app, model=model)
+
+ await assert_events_after_reconciliation(
+ app=app, github_repository=forked_github_repository, post_job_status=PostJobStatus.NORMAL
+ )
diff --git a/tests/integration/test_debug_ssh.py b/tests/integration/test_debug_ssh.py
index de3aeff5d..c9ef1d480 100644
--- a/tests/integration/test_debug_ssh.py
+++ b/tests/integration/test_debug_ssh.py
@@ -69,11 +69,19 @@ def is_workflow_complete():
)
zip_data = BytesIO(response.content)
with zipfile.ZipFile(zip_data, "r") as zip_ref:
+ logger.info("Files: %s", zip_ref.namelist())
tmate_log_filename = next(
- iter([name for name in zip_ref.namelist() if "Setup tmate session" in name])
+ iter(
+ [
+ name
+ for name in zip_ref.namelist()
+ if "workflow-dispatch-tests/3_Setup tmate session.txt" == name
+ ]
+ )
)
logs = str(zip_ref.read(tmate_log_filename), encoding="utf-8")
# ensure ssh connection info printed in logs.
+ logger.info("Logs: %s", logs)
assert tmate_ssh_server_unit_ip in logs, "Tmate ssh server IP not found in action logs."
assert "10022" in logs, "Tmate ssh server connection port not found in action logs."