From c066df39f3cd794c2573566b41b7dd12c910c03d Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Mon, 25 Mar 2024 13:59:41 +0800 Subject: [PATCH 01/19] github exception fix --- tests/integration/charm_metrics_helpers.py | 42 ++++++++++++------- .../integration/test_charm_metrics_failure.py | 2 +- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/tests/integration/charm_metrics_helpers.py b/tests/integration/charm_metrics_helpers.py index ac4602a0b..aad98742d 100644 --- a/tests/integration/charm_metrics_helpers.py +++ b/tests/integration/charm_metrics_helpers.py @@ -9,15 +9,20 @@ from time import sleep import requests +from github.Branch import Branch +from github.GithubException import GithubException from github.Repository import Repository from github.Workflow import Workflow +from github.WorkflowJob import WorkflowJob from juju.application import Application from juju.unit import Unit from github_type import JobConclusion from metrics import METRICS_LOG_PATH from runner_metrics import PostJobStatus -from tests.integration.helpers import get_runner_name, run_in_unit +from tests.integration.helpers import get_runner_name, run_in_unit, wait_for + +logger = logging.getLogger(__name__) TEST_WORKFLOW_NAMES = [ "Workflow Dispatch Tests", @@ -26,7 +31,7 @@ ] -async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow): +async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow, branch: Branch | None = None): """Wait for the workflow to start. Args: @@ -34,21 +39,28 @@ async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow): workflow: The workflow to wait for. """ runner_name = await get_runner_name(unit) - for _ in range(30): - for run in workflow.get_runs(): + + def is_runner_log(): + """Return whether a log for given runner exists.""" + for run in workflow.get_runs(branch=branch): jobs = run.jobs() - if jobs: - logs_url = jobs[0].logs_url() - logs = requests.get(logs_url).content.decode("utf-8") + if not jobs: + return False + try: + job: WorkflowJob = jobs[0] + logs = requests.get(job.logs_url()).content.decode("utf-8") + except GithubException as exc: + if exc.status == 410: + logger.warning("Transient github error, %s", exc) + return False + if runner_name in logs: + return True + return False - if runner_name in logs: - break - else: - sleep(30) - continue - break - else: - assert False, "Timeout while waiting for the workflow to start" + try: + await wait_for(is_runner_log, timeout=20 * 60, check_interval=30) + except TimeoutError as exc: + raise TimeoutError("Timeout while waiting for the workflow to start") from exc async def clear_metrics_log(unit: Unit) -> None: diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index 032b29b1e..45777eaa4 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -110,7 +110,7 @@ async def test_charm_issues_metrics_for_abnormal_termination( ) assert workflow.create_dispatch(forked_github_branch, {"runner": app.name}) - await wait_for_workflow_to_start(unit, workflow) + await wait_for_workflow_to_start(unit, workflow, branch=forked_github_branch) # Make the runner terminate abnormally by killing run.sh runner_name = await get_runner_name(unit) From 9f4e7ec9a3c1ac7534c2d9ad7ddd76527a97de5e Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 19:25:43 +0800 Subject: [PATCH 02/19] log inactive subprocess error --- src/charm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/charm.py b/src/charm.py index e595d195a..0d9e72086 100755 --- a/src/charm.py +++ b/src/charm.py @@ -286,8 +286,8 @@ def _ensure_service_health(self) -> None: logger.info("Checking health of repo-policy-compliance service") try: execute_command(["/usr/bin/systemctl", "is-active", "repo-policy-compliance"]) - except SubprocessError: - logger.exception("Found inactive repo-policy-compliance service") + except SubprocessError as exc: + logger.exception("Found inactive repo-policy-compliance service, %s", exc) execute_command(["/usr/bin/systemctl", "restart", "repo-policy-compliance"]) logger.info("Restart repo-policy-compliance service") raise From 10e0751d34a50ed3c2c02eb4010d1784282ed9f8 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 21:11:40 +0800 Subject: [PATCH 03/19] try upload-artifact fix --- .github/workflows/integration_test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 07a1ca231..0b07f17f6 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -8,7 +8,7 @@ jobs: # INTEGRATION_TEST_ARGS to operator-workflows automatically. integration-tests-juju2: name: Integration test with juju 2.9 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact secrets: inherit with: juju-channel: 2.9/stable @@ -18,7 +18,7 @@ jobs: modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' integration-tests: name: Integration test with juju 3.1 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact secrets: inherit with: juju-channel: 3.1/stable @@ -33,7 +33,7 @@ jobs: # - we need to disable the rbac addon for microk8s, otherwise the setup will fail integration-tests-microstack: name: Integration test using microstack - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact secrets: inherit with: juju-channel: 3.2/stable From 879c06e296d9f71cd56362bed2ad27e1ffd8fabc Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 21:17:21 +0800 Subject: [PATCH 04/19] github error on cancel workflow run --- tests/integration/charm_metrics_helpers.py | 23 ++++++++++++------- .../integration/test_charm_metrics_failure.py | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/integration/charm_metrics_helpers.py b/tests/integration/charm_metrics_helpers.py index aad98742d..87daba667 100644 --- a/tests/integration/charm_metrics_helpers.py +++ b/tests/integration/charm_metrics_helpers.py @@ -37,6 +37,7 @@ async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow, branch: Bra Args: unit: The unit which contains the runner. workflow: The workflow to wait for. + branch: The branch where the workflow belongs to. 
""" runner_name = await get_runner_name(unit) @@ -117,23 +118,29 @@ async def get_metrics_log(unit: Unit) -> str: return stdout.strip() -async def cancel_workflow_run(unit: Unit, workflow: Workflow): +async def cancel_workflow_run(unit: Unit, workflow: Workflow, branch: Branch | None = None): """Cancel the workflow run. Args: unit: The unit which contains the runner. workflow: The workflow to cancel the workflow run for. + branch: The branch where the workflow belongs to. """ runner_name = await get_runner_name(unit) - for run in workflow.get_runs(): + for run in workflow.get_runs(branch=branch): jobs = run.jobs() - if jobs: - logs_url = jobs[0].logs_url() - logs = requests.get(logs_url).content.decode("utf-8") - - if runner_name in logs: - run.cancel() + if not jobs: + continue + try: + job: WorkflowJob = jobs[0] + logs = requests.get(job.logs_url()).content.decode("utf-8") + except GithubException as exc: + if exc.status == 410: + logger.warning("Transient github error, %s", exc) + continue + if runner_name in logs: + run.cancel() async def assert_events_after_reconciliation( diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index 45777eaa4..379c8d1ae 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -120,7 +120,7 @@ async def test_charm_issues_metrics_for_abnormal_termination( # Cancel workflow and wait that the runner is marked offline # to avoid errors during reconciliation. - await cancel_workflow_run(unit, workflow) + await cancel_workflow_run(unit, workflow, branch=forked_github_branch) await wait_for_runner_to_be_marked_offline(forked_github_repository, runner_name) # Set the number of virtual machines to 0 to speedup reconciliation From 84cbf74f60f9bf418ad0d176121a089e50e90fcc Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 22:27:56 +0800 Subject: [PATCH 05/19] add stdout/err hints to help debug test failures --- tests/integration/charm_metrics_helpers.py | 12 ++-- tests/integration/helpers.py | 55 ++++++++++++------- .../integration/test_charm_metrics_failure.py | 16 +++--- tests/integration/test_charm_one_runner.py | 22 +++++--- .../test_charm_scheduled_events.py | 6 +- tests/integration/test_charm_with_proxy.py | 36 +++++++----- tests/integration/test_self_hosted_runner.py | 4 +- 7 files changed, 90 insertions(+), 61 deletions(-) diff --git a/tests/integration/charm_metrics_helpers.py b/tests/integration/charm_metrics_helpers.py index 87daba667..c12d92349 100644 --- a/tests/integration/charm_metrics_helpers.py +++ b/tests/integration/charm_metrics_helpers.py @@ -70,11 +70,11 @@ async def clear_metrics_log(unit: Unit) -> None: Args: unit: The unit to clear the metrics log on. """ - retcode, _ = await run_in_unit( + retcode, _, stderr = await run_in_unit( unit=unit, command=f"if [ -f {METRICS_LOG_PATH} ]; then rm {METRICS_LOG_PATH}; fi", ) - assert retcode == 0, "Failed to clear metrics log" + assert retcode == 0, f"Failed to clear metrics log, {stderr}" async def print_loop_device_info(unit: Unit, loop_device: str) -> None: @@ -84,11 +84,11 @@ async def print_loop_device_info(unit: Unit, loop_device: str) -> None: unit: The unit to print the loop device info on. loop_device: The loop device to print the info for. 
""" - retcode, stdout = await run_in_unit( + retcode, stdout, stderr = await run_in_unit( unit=unit, command="sudo losetup -lJ", ) - assert retcode == 0, f"Failed to get loop devices: {stdout}" + assert retcode == 0, f"Failed to get loop devices: {stdout} {stderr}" assert stdout is not None, "Failed to get loop devices, no stdout message" loop_devices_info = json.loads(stdout) for loop_device_info in loop_devices_info["loopdevices"]: @@ -108,11 +108,11 @@ async def get_metrics_log(unit: Unit) -> str: Returns: The metrics log. """ - retcode, stdout = await run_in_unit( + retcode, stdout, stderr = await run_in_unit( unit=unit, command=f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi", ) - assert retcode == 0, f"Failed to get metrics log: {stdout}" + assert retcode == 0, f"Failed to get metrics log: {stdout} {stderr}" assert stdout is not None, "Failed to get metrics log, no stdout message" logging.info("Metrics log: %s", stdout) return stdout.strip() diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index aca334a72..04f24bcbd 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -22,6 +22,7 @@ from github.Workflow import Workflow from github.WorkflowJob import WorkflowJob from github.WorkflowRun import WorkflowRun +from juju.action import Action from juju.application import Application from juju.model import Model from juju.unit import Unit @@ -49,7 +50,7 @@ async def check_runner_binary_exists(unit: Unit) -> bool: Returns: Whether the runner binary file exists in the charm. """ - return_code, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") return return_code == 0 @@ -62,10 +63,12 @@ async def get_repo_policy_compliance_pip_info(unit: Unit) -> None | str: Returns: If repo-policy-compliance is installed, returns the pip show output, else returns none. """ - return_code, stdout = await run_in_unit(unit, "python3 -m pip show repo-policy-compliance") + return_code, stdout, stderr = await run_in_unit( + unit, "python3 -m pip show repo-policy-compliance" + ) if return_code == 0: - return stdout + return stdout or stderr return None @@ -77,14 +80,16 @@ async def install_repo_policy_compliance_from_git_source(unit: Unit, source: Non unit: Unit instance to check for the LXD profile. source: The git source to install the package. If none the package is removed. """ - return_code, stdout = await run_in_unit( + return_code, stdout, stderr = await run_in_unit( unit, "python3 -m pip uninstall --yes repo-policy-compliance" ) - assert return_code == 0, f"Failed to uninstall repo-policy-compliance: {stdout}" + assert return_code == 0, f"Failed to uninstall repo-policy-compliance: {stdout} {stderr}" if source: - return_code, _ = await run_in_unit(unit, f"python3 -m pip install {source}") - assert return_code == 0 + return_code, stdout, stderr = await run_in_unit(unit, f"python3 -m pip install {source}") + assert ( + return_code == 0 + ), f"Failed to install repo-policy-compliance from source, {stdout} {stderr}" async def remove_runner_bin(unit: Unit) -> None: @@ -96,7 +101,7 @@ async def remove_runner_bin(unit: Unit) -> None: await run_in_unit(unit, f"rm {RunnerManager.runner_bin_path}") # No file should exists under with the filename. 
- return_code, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") assert return_code != 0 @@ -117,7 +122,7 @@ async def assert_resource_lxd_profile(unit: Unit, configs: dict[str, Any]) -> No resource_profile_name = Runner._get_resource_profile_name(cpu, mem, disk) # Verify the profile exists. - return_code, stdout = await run_in_unit(unit, "lxc profile list --format json") + return_code, stdout, _ = await run_in_unit(unit, "lxc profile list --format json") assert return_code == 0 assert stdout is not None profiles = json.loads(stdout) @@ -125,7 +130,7 @@ async def assert_resource_lxd_profile(unit: Unit, configs: dict[str, Any]) -> No assert resource_profile_name in profile_names # Verify the profile contains the correct resource settings. - return_code, stdout = await run_in_unit(unit, f"lxc profile show {resource_profile_name}") + return_code, stdout, _ = await run_in_unit(unit, f"lxc profile show {resource_profile_name}") assert return_code == 0 assert stdout is not None profile_content = yaml.safe_load(stdout) @@ -143,7 +148,7 @@ async def get_runner_names(unit: Unit) -> tuple[str, ...]: Returns: Tuple of runner names. """ - return_code, stdout = await run_in_unit(unit, "lxc list --format json") + return_code, stdout, _ = await run_in_unit(unit, "lxc list --format json") assert return_code == 0 assert stdout is not None @@ -164,7 +169,7 @@ async def wait_till_num_of_runners(unit: Unit, num: int) -> None: AssertionError: Correct number of runners is not found within timeout limit. """ - return_code, stdout = await run_in_unit(unit, "lxc list --format json") + return_code, stdout, _ = await run_in_unit(unit, "lxc list --format json") assert return_code == 0 assert stdout is not None @@ -174,7 +179,7 @@ async def wait_till_num_of_runners(unit: Unit, num: int) -> None: ), f"Current number of runners: {len(lxc_instance)} Expected number of runner: {num}" for instance in lxc_instance: - return_code, stdout = await run_in_unit(unit, f"lxc exec {instance['name']} -- ps aux") + return_code, stdout, _ = await run_in_unit(unit, f"lxc exec {instance['name']} -- ps aux") assert return_code == 0 assert stdout is not None @@ -192,7 +197,9 @@ def on_juju_2() -> bool: return not hasattr(juju.version, "SUPPORTED_MAJOR_VERSION") -async def run_in_unit(unit: Unit, command: str, timeout=None) -> tuple[int, str | None]: +async def run_in_unit( + unit: Unit, command: str, timeout=None +) -> tuple[int, str | None, str | None]: """Run command in juju unit. Compatible with juju 3 and juju 2. @@ -203,16 +210,24 @@ async def run_in_unit(unit: Unit, command: str, timeout=None) -> tuple[int, str timeout: Amount of time to wait for the execution. Returns: - Tuple of return code and stdout. + Tuple of return code, stdout and stderr. """ - action = await unit.run(command, timeout) + action: Action = await unit.run(command, timeout) # For compatibility with juju 2. 
if on_juju_2(): - return (int(action.results["Code"]), action.results.get("Stdout", None)) + return ( + int(action.results["Code"]), + action.results.get("Stdout", None), + action.results.get("Stderr", None), + ) await action.wait() - return (action.results["return-code"], action.results.get("stdout", None)) + return ( + action.results["return-code"], + action.results.get("stdout", None), + action.results.get("stderr", None), + ) async def run_in_lxd_instance( @@ -222,7 +237,7 @@ async def run_in_lxd_instance( env: dict[str, str] | None = None, cwd: str | None = None, timeout: int | None = None, -) -> tuple[int, str | None]: +) -> tuple[int, str | None, str | None]: """Run command in LXD instance of a juju unit. Args: @@ -266,7 +281,7 @@ async def start_test_http_server(unit: Unit, port: int): # Test the HTTP server for _ in range(10): - return_code, stdout = await run_in_unit(unit, f"curl http://localhost:{port}") + return_code, stdout, _ = await run_in_unit(unit, f"curl http://localhost:{port}") if return_code == 0 and stdout: break await sleep(3) diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index 379c8d1ae..a0941c7ca 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -115,8 +115,8 @@ async def test_charm_issues_metrics_for_abnormal_termination( # Make the runner terminate abnormally by killing run.sh runner_name = await get_runner_name(unit) kill_run_sh_cmd = "pkill -9 run.sh" - ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd) - assert ret_code == 0, "Failed to kill run.sh" + ret_code, stdout, stderr = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd) + assert ret_code == 0, f"Failed to kill run.sh, {stdout} {stderr}" # Cancel workflow and wait that the runner is marked offline # to avoid errors during reconciliation. @@ -151,19 +151,21 @@ async def test_charm_retrieves_logs_from_unhealthy_runners( runner_name = await get_runner_name(unit) kill_start_sh_cmd = "pkill -9 start.sh" - ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd) - assert ret_code == 0, "Failed to kill start.sh" + ret_code, stdout, stderr = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd) + assert ret_code == 0, f"Failed to kill start.sh, {stdout} {stderr}" # Set the number of virtual machines to 0 to avoid to speedup reconciliation. 
await app.set_config({"virtual-machines": "0"}) await reconcile(app=app, model=model) - ret_code, stdout = await run_in_unit(unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}") - assert ret_code == 0, "Failed to list crashed runner logs" + ret_code, stdout, stderr = await run_in_unit( + unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}" + ) + assert ret_code == 0, f"Failed to list crashed runner logs {stdout} {stderr}" assert stdout assert runner_name in stdout, "Failed to find crashed runner log" - ret_code, stdout = await run_in_unit( + ret_code, stdout, _ = await run_in_unit( unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}/{runner_name}" ) assert ret_code == 0, "Failed to list crashed runner log" diff --git a/tests/integration/test_charm_one_runner.py b/tests/integration/test_charm_one_runner.py index eb305b25d..33a11270c 100644 --- a/tests/integration/test_charm_one_runner.py +++ b/tests/integration/test_charm_one_runner.py @@ -53,12 +53,12 @@ async def test_network_access(app: Application) -> None: names = await get_runner_names(unit) assert names - return_code, stdout = await run_in_unit(unit, "lxc network get lxdbr0 ipv4.address") - assert return_code == 0 + return_code, stdout, stderr = await run_in_unit(unit, "lxc network get lxdbr0 ipv4.address") + assert return_code == 0, f"Failed to get network address {stdout} {stderr}" assert stdout is not None host_ip, _ = stdout.split("/", 1) - return_code, stdout = await run_in_lxd_instance( + return_code, stdout, _ = await run_in_lxd_instance( unit, names[0], f"curl http://{host_ip}:{port}" ) @@ -159,11 +159,13 @@ async def test_token_config_changed(model: Model, app: Application, token_alt: s await app.set_config({"token": token_alt}) await model.wait_for_idle(status=ACTIVE, timeout=30 * 60) - return_code, stdout = await run_in_unit( + return_code, stdout, stderr = await run_in_unit( unit, "cat /etc/systemd/system/repo-policy-compliance.service" ) - assert return_code == 0 + assert ( + return_code == 0 + ), f"Failed to get repo-policy-compliance unit file contents {stdout} {stderr}" assert stdout is not None assert f"GITHUB_TOKEN={token_alt}" in stdout @@ -193,8 +195,10 @@ async def test_reconcile_runners_with_lxd_storage_pool_failure( await reconcile(app=app, model=model) await wait_till_num_of_runners(unit, 0) - exit_code, _ = await run_in_unit(unit, f"rm -rf {GithubRunnerCharm.ram_pool_path}/*") - assert exit_code == 0 + exit_code, stdout, stderr = await run_in_unit( + unit, f"rm -rf {GithubRunnerCharm.ram_pool_path}/*" + ) + assert exit_code == 0, f"Failed to delete ram pool {stdout} {stderr}" # 2. 
await app.set_config({"virtual-machines": "1"}) @@ -267,10 +271,10 @@ async def test_disabled_apt_daily_upgrades(model: Model, app: Application) -> No names = await get_runner_names(unit) assert names, "LXD runners not ready" - ret_code, stdout = await run_in_lxd_instance( + ret_code, stdout, stderr = await run_in_lxd_instance( unit, names[0], "sudo systemctl list-units --no-pager" ) - assert ret_code == 0, "Failed to list systemd units" + assert ret_code == 0, f"Failed to list systemd units {stdout} {stderr}" assert stdout, "No units listed in stdout" assert "apt-daily" not in stdout # this also checks for apt-daily-upgrade service diff --git a/tests/integration/test_charm_scheduled_events.py b/tests/integration/test_charm_scheduled_events.py index 29265d35e..3fa5c9a97 100644 --- a/tests/integration/test_charm_scheduled_events.py +++ b/tests/integration/test_charm_scheduled_events.py @@ -44,13 +44,15 @@ async def test_update_interval(model: Model, app_scheduled_events: Application) unit = app_scheduled_events.units[0] assert await check_runner_binary_exists(unit) - await run_in_unit(unit, f"rm -f {RunnerManager.runner_bin_path}") + ret_code, stdout, stderr = await run_in_unit(unit, f"rm -f {RunnerManager.runner_bin_path}") + assert ret_code == 0, f"Failed to remove runner binary {stdout} {stderr}" assert not await check_runner_binary_exists(unit) runner_names = await get_runner_names(unit) assert len(runner_names) == 1 runner_name = runner_names[0] - await run_in_unit(unit, f"lxc stop --force {runner_name}") + ret_code, stdout, stderr = await run_in_unit(unit, f"lxc stop --force {runner_name}") + assert ret_code == 0, f"Failed to stop lxd instance, {stdout} {stderr}" await wait_till_num_of_runners(unit, 0) await sleep(10 * 60) diff --git a/tests/integration/test_charm_with_proxy.py b/tests/integration/test_charm_with_proxy.py index bef653c34..88639aa56 100644 --- a/tests/integration/test_charm_with_proxy.py +++ b/tests/integration/test_charm_with_proxy.py @@ -231,8 +231,8 @@ async def _assert_proxy_vars_in_file(unit: Unit, runner_name: str, file_path: st file_path: The path to the file to check for proxy environment variables. not_set: Whether the proxy environment variables should be set or not. """ - return_code, stdout = await run_in_lxd_instance(unit, runner_name, f"cat {file_path}") - assert return_code == 0, "Failed to read file" + return_code, stdout, stderr = await run_in_lxd_instance(unit, runner_name, f"cat {file_path}") + assert return_code == 0, f"Failed to read file {stdout} {stderr}" assert stdout, "File is empty" _assert_proxy_var_in(stdout, not_in=not_set) @@ -245,7 +245,7 @@ async def _assert_docker_proxy_vars(unit: Unit, runner_name: str, not_set=False) runner_name: The name of the runner. not_set: Whether the proxy environment variables should be set or not. """ - return_code, _ = await run_in_lxd_instance( + return_code, _, _ = await run_in_lxd_instance( unit, runner_name, "docker run --rm alpine sh -c 'env | grep -i _PROXY'" ) assert return_code == (1 if not_set else 0) @@ -296,14 +296,16 @@ async def _get_aproxy_logs(unit: Unit, runner_name: str) -> Optional[str]: Returns: The aproxy logs if existent, otherwise None. 
""" - return_code, stdout = await run_in_lxd_instance( + return_code, stdout, stderr = await run_in_lxd_instance( unit, runner_name, "snap logs aproxy.aproxy -n=all" ) - assert return_code == 0, "Failed to get aproxy logs" + assert return_code == 0, f"Failed to get aproxy logs {stdout} {stderr}" return stdout -async def _curl_as_ubuntu_user(unit: Unit, runner_name: str, url: str) -> tuple[int, str | None]: +async def _curl_as_ubuntu_user( + unit: Unit, runner_name: str, url: str +) -> tuple[int, str | None, str | None]: """Run curl as a logged in ubuntu user. This should simulate the bevahiour of a curl inside the runner with environment variables set. @@ -314,7 +316,7 @@ async def _curl_as_ubuntu_user(unit: Unit, runner_name: str, url: str) -> tuple[ url: The URL to curl. Returns: - The return code and stdout of the curl command. + The return code, stdout, stderr of the curl command. """ return await run_in_lxd_instance( unit, @@ -350,20 +352,22 @@ async def test_usage_of_aproxy(model: Model, app: Application, proxy_logs_filepa _clear_tinyproxy_log(proxy_logs_filepath) # 1. URL with standard port, should succeed, gets intercepted by aproxy - return_code, stdout = await _curl_as_ubuntu_user(unit, runner_name, "http://canonical.com") + return_code, stdout, stderr = await _curl_as_ubuntu_user( + unit, runner_name, "http://canonical.com" + ) assert ( return_code == 0 - ), f"Expected successful connection to http://canonical.com. Error msg: {stdout}" + ), f"Expected successful connection to http://canonical.com. Error msg: {stdout} {stderr}" # 2. URL with non-standard port, should fail, request does not get intercepted by aproxy - return_code, stdout = await _curl_as_ubuntu_user( + return_code, stdout, stderr = await _curl_as_ubuntu_user( unit, runner_name, f"http://canonical.com:{NON_STANDARD_PORT}", ) assert return_code == 7, ( f"Expected cannot connect error for http://canonical.com:{NON_STANDARD_PORT}. " - f"Error msg: {stdout}" + f"Error msg: {stdout} {stderr}" ) aproxy_logs = await _get_aproxy_logs(unit, runner_name) @@ -407,10 +411,12 @@ async def test_use_proxy_without_aproxy( _clear_tinyproxy_log(proxy_logs_filepath) # 1. URL with standard port, should succeed - return_code, stdout = await _curl_as_ubuntu_user(unit, runner_name, "http://canonical.com") + return_code, stdout, stderr = await _curl_as_ubuntu_user( + unit, runner_name, "http://canonical.com" + ) assert ( return_code == 0 - ), f"Expected successful connection to http://canonical.com. Error msg: {stdout}" + ), f"Expected successful connection to http://canonical.com. Error msg: {stdout} {stderr}" # 2. URL with non-standard port, should return an error message by the proxy like this: # @@ -423,14 +429,14 @@ async def test_use_proxy_without_aproxy( #
     # (an HTML error page from tinyproxy whose body ends with
     # "Generated by tinyproxy version 1.11.0.")
# # - return_code, stdout = await _curl_as_ubuntu_user( + return_code, stdout, stderr = await _curl_as_ubuntu_user( unit, runner_name, f"http://canonical.com:{NON_STANDARD_PORT}", ) assert return_code == 0, ( f"Expected error response from proxy for http://canonical.com:{NON_STANDARD_PORT}. " - f"Error msg: {stdout}" + f"Error msg: {stdout} {stderr}" ) proxy_logs = proxy_logs_filepath.read_text(encoding="utf-8") diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py index bf76ac68d..95ef034a5 100644 --- a/tests/integration/test_self_hosted_runner.py +++ b/tests/integration/test_self_hosted_runner.py @@ -57,10 +57,10 @@ async def test_dispatch_workflow_with_dockerhub_mirror( runner_to_be_used = names[0] - return_code, stdout = await run_in_lxd_instance( + return_code, stdout, stderr = await run_in_lxd_instance( unit, runner_to_be_used, "cat /etc/docker/daemon.json" ) - assert return_code == 0 + assert return_code == 0, f"Failed to get docker daemon contents, {stdout} {stderr}" assert stdout is not None assert "registry-mirrors" in stdout assert fake_url in stdout From aed2b40aafe0c335dfd1e704dd3fdd4ad068f0d9 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 23:06:19 +0800 Subject: [PATCH 06/19] log apt source list error --- src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index 1b4c074bd..f9cb44f9b 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src/runner_manager.py b/src/runner_manager.py index 8a8992b1c..7e3fafa96 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -25,7 +25,7 @@ import runner_metrics import shared_fs from charm_state import VirtualMachineResources -from errors import IssueMetricEventError, RunnerBinaryError, RunnerCreateError +from errors import IssueMetricEventError, RunnerBinaryError, RunnerCreateError, SubprocessError from github_client import GithubClient from github_type import RunnerApplication, SelfHostedRunner from lxd import LxdClient, LxdInstance @@ -746,7 +746,15 @@ def build_runner_image(self) -> None: Raises: LxdError: Unable to build the LXD image. """ - execute_command(self._build_image_command()) + try: + execute_command(self._build_image_command()) + # This is for debugging common error in test with build image. + # This should be removed before merging. + except SubprocessError as exc: + if "Type 'ubuntu' is not known on line 43 in source lis" in str(exc): + stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"]) + logger.critical("Failed to execute build image command: %s %s", stdout, ret_code) + raise def schedule_build_runner_image(self) -> None: """Install cron job for building runner image.""" From 5333de0e0238b99c8c6e8f770323d3f4a93eaf1a Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 02:46:27 +0800 Subject: [PATCH 07/19] log before error --- src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index f9cb44f9b..cf390d14b 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. 
---

- 
+ 

 ### function `schedule_build_runner_image`

diff --git a/src/runner_manager.py b/src/runner_manager.py
index 7e3fafa96..d4710c4ca 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -747,11 +747,13 @@ def build_runner_image(self) -> None:
             LxdError: Unable to build the LXD image.
         """
         try:
+            stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"])
+            logger.error("Before build image command: %s %s", stdout, ret_code)
             execute_command(self._build_image_command())
         # This is for debugging common error in test with build image.
         # This should be removed before merging.
         except SubprocessError as exc:
-            if "Type 'ubuntu' is not known on line 43 in source lis" in str(exc):
+            if "Type 'ubuntu' is not known on line 43 in source list" in str(exc):
                 stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"])
                 logger.critical("Failed to execute build image command: %s %s", stdout, ret_code)
             raise

From 56c8261914c9089e40585a0ee68b250550597102 Mon Sep 17 00:00:00 2001
From: charlie4284
Date: Wed, 27 Mar 2024 02:48:35 +0800
Subject: [PATCH 08/19] log builder sources.list

---
 src/runner_manager.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runner_manager.py b/src/runner_manager.py
index d4710c4ca..ab376aa23 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -747,14 +747,14 @@ def build_runner_image(self) -> None:
             LxdError: Unable to build the LXD image.
         """
         try:
-            stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"])
-            logger.error("Before build image command: %s %s", stdout, ret_code)
             execute_command(self._build_image_command())
         # This is for debugging common error in test with build image.
         # This should be removed before merging.
         except SubprocessError as exc:
             if "Type 'ubuntu' is not known on line 43 in source list" in str(exc):
-                stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"])
+                stdout, ret_code = execute_command(
+                    ["/snap/bin/lxc", "exec", "builder", "--", "cat /etc/apt/sources.list"]
+                )
                 logger.critical("Failed to execute build image command: %s %s", stdout, ret_code)
             raise

From edebfe394bc476d681150f17816dbdb5e596314b Mon Sep 17 00:00:00 2001
From: charlie4284
Date: Wed, 27 Mar 2024 10:02:44 +0800
Subject: [PATCH 09/19] apt exec command fix

---
 src/runner_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runner_manager.py b/src/runner_manager.py
index ab376aa23..119dcade6 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -753,7 +753,7 @@ def build_runner_image(self) -> None:
         except SubprocessError as exc:
             if "Type 'ubuntu' is not known on line 43 in source list" in str(exc):
                 stdout, ret_code = execute_command(
-                    ["/snap/bin/lxc", "exec", "builder", "--", "cat /etc/apt/sources.list"]
+                    ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"]
                 )
                 logger.critical("Failed to execute build image command: %s %s", stdout, ret_code)
                 raise

From 5c01e3db89c8d6d882411e6e06288c87dd91554a Mon Sep 17 00:00:00 2001
From: charlie4284
Date: Wed, 27 Mar 2024 11:22:06 +0800
Subject: [PATCH 10/19] add tmate debug

---
 .github/workflows/integration_test.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
index 0b07f17f6..821d873c6 100644
--- a/.github/workflows/integration_test.yaml
+++ b/.github/workflows/integration_test.yaml
@@ -16,6 +16,8 @@ jobs:
       provider: lxd
       test-tox-env: integration-juju2.9
      modules: '["test_charm_fork_repo", "test_charm_no_runner", 
"test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' + tmate-debug: true + tmate-timeout: 90 integration-tests: name: Integration test with juju 3.1 uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact @@ -26,6 +28,8 @@ jobs: provider: lxd test-tox-env: integration-juju3.1 modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' + tmate-debug: true + tmate-timeout: 90 # openstack tests use microstack, whose setup is kind of special # - due to the huge resource requirements, we use self-hosted runners for these tests # - microstack requires juju 3.2 and microk8s 1.26 From 79debedc50851b612aa5f9d351fe54e989aa9526 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 15:35:13 +0800 Subject: [PATCH 11/19] retry on assertion error --- tests/integration/helpers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 04f24bcbd..34e6f52e8 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -5,6 +5,7 @@ import inspect import json +import logging import subprocess import time import typing @@ -40,6 +41,8 @@ DEFAULT_RUNNER_CONSTRAINTS = {"root-disk": 15} +logger = logging.getLogger(__name__) + async def check_runner_binary_exists(unit: Unit) -> bool: """Checks if runner binary exists in the charm. @@ -157,7 +160,7 @@ async def get_runner_names(unit: Unit) -> tuple[str, ...]: return tuple(runner["name"] for runner in lxc_instance) -@retry(tries=30, delay=30) +@retry(exception=AssertionError, tries=30, delay=30, local_logger=logger) async def wait_till_num_of_runners(unit: Unit, num: int) -> None: """Wait and check the number of runners. From a6e3be5dc33f279a99069ba94c11513a60f8bce2 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 20:42:26 +0800 Subject: [PATCH 12/19] replace retry --- tests/integration/helpers.py | 41 ++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 34e6f52e8..3e8f640dd 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -31,7 +31,6 @@ from runner import Runner from runner_manager import RunnerManager from tests.status_name import ACTIVE -from utilities import retry DISPATCH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_test.yaml" DISPATCH_CRASH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_crash_test.yaml" @@ -160,8 +159,7 @@ async def get_runner_names(unit: Unit) -> tuple[str, ...]: return tuple(runner["name"] for runner in lxc_instance) -@retry(exception=AssertionError, tries=30, delay=30, local_logger=logger) -async def wait_till_num_of_runners(unit: Unit, num: int) -> None: +async def wait_till_num_of_runners(unit: Unit, num: int, timeout: int = 10 * 60) -> None: """Wait and check the number of runners. Args: @@ -172,16 +170,37 @@ async def wait_till_num_of_runners(unit: Unit, num: int) -> None: AssertionError: Correct number of runners is not found within timeout limit. 
""" - return_code, stdout, _ = await run_in_unit(unit, "lxc list --format json") - assert return_code == 0 - assert stdout is not None - lxc_instance = json.loads(stdout) - assert ( - len(lxc_instance) == num - ), f"Current number of runners: {len(lxc_instance)} Expected number of runner: {num}" + async def get_lxc_instances() -> None | list[dict]: + """Get lxc instances list info. + + Returns: + List of lxc instance dictionaries, None if failed to get list. + """ + return_code, stdout, _ = await run_in_unit(unit, "lxc list --format json") + if return_code != 0 or not stdout: + logger.error("Failed to run lxc list, %s", return_code) + return None + return json.loads(stdout) + + async def is_desired_num_runners(): + """Return whether there are desired number of lxc instances running. + + Returns: + Whether the desired number of lxc runners have been reached. + """ + lxc_instances = await get_lxc_instances() + if lxc_instances is None: + return False + return len(lxc_instances) == num + + await wait_for(is_desired_num_runners, timeout=timeout, check_interval=30) + + instances = await get_lxc_instances() + if not instances: + return - for instance in lxc_instance: + for instance in instances: return_code, stdout, _ = await run_in_unit(unit, f"lxc exec {instance['name']} -- ps aux") assert return_code == 0 From 197b28c7609631874ecc8e8ade4a8d32309c7989 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 22:01:50 +0800 Subject: [PATCH 13/19] filter builder from runner --- tests/integration/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 3e8f640dd..e8b2c2a17 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -156,7 +156,7 @@ async def get_runner_names(unit: Unit) -> tuple[str, ...]: assert stdout is not None lxc_instance: list[dict[str, str]] = json.loads(stdout) - return tuple(runner["name"] for runner in lxc_instance) + return tuple(runner["name"] for runner in lxc_instance if runner["name"] != "builder") async def wait_till_num_of_runners(unit: Unit, num: int, timeout: int = 10 * 60) -> None: From 475da6b8b66a1ce5a4b480188857e23219bc5b22 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 22:08:43 +0800 Subject: [PATCH 14/19] debug source.list bug --- src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index cf390d14b..bba70e97f 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src/runner_manager.py b/src/runner_manager.py index 119dcade6..975ccf3dd 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -747,14 +747,16 @@ def build_runner_image(self) -> None: LxdError: Unable to build the LXD image. """ try: + # This is for debugging common error in test with build image. + # This should be removed before merging. + stdout, ret_code = execute_command( + ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] + ) execute_command(self._build_image_command()) # This is for debugging common error in test with build image. # This should be removed before merging. 
except SubprocessError as exc: if "Type 'ubuntu' is not known on line 43 in source list" in str(exc): - stdout, ret_code = execute_command( - ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] - ) logger.critical("Failed to execute build image command: %s %s", stdout, ret_code) raise From b5d487d291764e73c59f6825fa769a089f9d575c Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Thu, 28 Mar 2024 07:02:05 +0800 Subject: [PATCH 15/19] revert log --- src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index bba70e97f..cf390d14b 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src/runner_manager.py b/src/runner_manager.py index 975ccf3dd..119dcade6 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -747,16 +747,14 @@ def build_runner_image(self) -> None: LxdError: Unable to build the LXD image. """ try: - # This is for debugging common error in test with build image. - # This should be removed before merging. - stdout, ret_code = execute_command( - ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] - ) execute_command(self._build_image_command()) # This is for debugging common error in test with build image. # This should be removed before merging. except SubprocessError as exc: if "Type 'ubuntu' is not known on line 43 in source list" in str(exc): + stdout, ret_code = execute_command( + ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] + ) logger.critical("Failed to execute build image command: %s %s", stdout, ret_code) raise From 23a139b65cf6fcf7748c0b6122b0954f1c4cc1ee Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 2 Apr 2024 13:29:30 +0800 Subject: [PATCH 16/19] revert debug commits --- .github/workflows/integration_test.yaml | 8 ++------ src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 14 ++------------ 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 821d873c6..046e4d921 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -16,11 +16,9 @@ jobs: provider: lxd test-tox-env: integration-juju2.9 modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' - tmate-debug: true - tmate-timeout: 90 integration-tests: name: Integration test with juju 3.1 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main secrets: inherit with: juju-channel: 3.1/stable @@ -28,8 +26,6 @@ jobs: provider: lxd test-tox-env: integration-juju3.1 modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' - tmate-debug: true - tmate-timeout: 90 # openstack tests use microstack, whose setup is kind of special # - due to the huge resource 
requirements, we use self-hosted runners for these tests # - microstack requires juju 3.2 and microk8s 1.26 @@ -37,7 +33,7 @@ jobs: # - we need to disable the rbac addon for microk8s, otherwise the setup will fail integration-tests-microstack: name: Integration test using microstack - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main secrets: inherit with: juju-channel: 3.2/stable diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index cf390d14b..1b4c074bd 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src/runner_manager.py b/src/runner_manager.py index 119dcade6..8a8992b1c 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -25,7 +25,7 @@ import runner_metrics import shared_fs from charm_state import VirtualMachineResources -from errors import IssueMetricEventError, RunnerBinaryError, RunnerCreateError, SubprocessError +from errors import IssueMetricEventError, RunnerBinaryError, RunnerCreateError from github_client import GithubClient from github_type import RunnerApplication, SelfHostedRunner from lxd import LxdClient, LxdInstance @@ -746,17 +746,7 @@ def build_runner_image(self) -> None: Raises: LxdError: Unable to build the LXD image. """ - try: - execute_command(self._build_image_command()) - # This is for debugging common error in test with build image. - # This should be removed before merging. - except SubprocessError as exc: - if "Type 'ubuntu' is not known on line 43 in source list" in str(exc): - stdout, ret_code = execute_command( - ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] - ) - logger.critical("Failed to execute build image command: %s %s", stdout, ret_code) - raise + execute_command(self._build_image_command()) def schedule_build_runner_image(self) -> None: """Install cron job for building runner image.""" From 5bda3fd5ce676c6033287234b806e578e64c57b3 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 2 Apr 2024 13:30:04 +0800 Subject: [PATCH 17/19] revert debug commits --- .github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 046e4d921..07a1ca231 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -8,7 +8,7 @@ jobs: # INTEGRATION_TEST_ARGS to operator-workflows automatically. 
integration-tests-juju2: name: Integration test with juju 2.9 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main secrets: inherit with: juju-channel: 2.9/stable From a0bf2bcfcd50c4eae6e503301dc2a5c5a95e6881 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Mon, 8 Apr 2024 11:03:42 +0800 Subject: [PATCH 18/19] add time since param --- tests/integration/charm_metrics_helpers.py | 14 ++++++++++++-- tests/integration/test_charm_metrics_failure.py | 6 +++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/integration/charm_metrics_helpers.py b/tests/integration/charm_metrics_helpers.py index c12d92349..55d2f1d18 100644 --- a/tests/integration/charm_metrics_helpers.py +++ b/tests/integration/charm_metrics_helpers.py @@ -4,6 +4,7 @@ """Utilities for charm metrics integration tests.""" +import datetime import json import logging from time import sleep @@ -31,19 +32,28 @@ ] -async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow, branch: Branch | None = None): +async def wait_for_workflow_to_start( + unit: Unit, workflow: Workflow, branch: Branch | None = None, started_time: float | None = None +): """Wait for the workflow to start. Args: unit: The unit which contains the runner. workflow: The workflow to wait for. branch: The branch where the workflow belongs to. + started_time: The time in seconds since epoch the job was started. """ runner_name = await get_runner_name(unit) + created_at = ( + None + if not started_time + # convert to integer since GH API takes up to seconds. + else f">={datetime.datetime.fromtimestamp(int(started_time)).isoformat()}" + ) def is_runner_log(): """Return whether a log for given runner exists.""" - for run in workflow.get_runs(branch=branch): + for run in workflow.get_runs(branch=branch, created=created_at): jobs = run.jobs() if not jobs: return False diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index a0941c7ca..933ebc5d2 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -2,6 +2,7 @@ # See LICENSE file for licensing details. 
"""Integration tests for metrics/logs assuming Github workflow failures or a runner crash.""" +import time from typing import AsyncIterator import pytest @@ -108,9 +109,12 @@ async def test_charm_issues_metrics_for_abnormal_termination( workflow = forked_github_repository.get_workflow( id_or_file_name=DISPATCH_CRASH_TEST_WORKFLOW_FILENAME ) + dispatch_time = time.time() assert workflow.create_dispatch(forked_github_branch, {"runner": app.name}) - await wait_for_workflow_to_start(unit, workflow, branch=forked_github_branch) + await wait_for_workflow_to_start( + unit, workflow, branch=forked_github_branch, started_time=dispatch_time + ) # Make the runner terminate abnormally by killing run.sh runner_name = await get_runner_name(unit) From 19323f7a112eb4d37a66c97056c0ff5b191187ca Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Mon, 8 Apr 2024 11:04:05 +0800 Subject: [PATCH 19/19] remove redundant exc --- src/charm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/charm.py b/src/charm.py index 0d9e72086..e7531bc81 100755 --- a/src/charm.py +++ b/src/charm.py @@ -286,8 +286,8 @@ def _ensure_service_health(self) -> None: logger.info("Checking health of repo-policy-compliance service") try: execute_command(["/usr/bin/systemctl", "is-active", "repo-policy-compliance"]) - except SubprocessError as exc: - logger.exception("Found inactive repo-policy-compliance service, %s", exc) + except SubprocessError: + logger.exception("Found inactive repo-policy-compliance service.") execute_command(["/usr/bin/systemctl", "restart", "repo-policy-compliance"]) logger.info("Restart repo-policy-compliance service") raise