From c066df39f3cd794c2573566b41b7dd12c910c03d Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Mon, 25 Mar 2024 13:59:41 +0800 Subject: [PATCH 01/19] github exception fix --- tests/integration/charm_metrics_helpers.py | 42 ++++++++++++------- .../integration/test_charm_metrics_failure.py | 2 +- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/tests/integration/charm_metrics_helpers.py b/tests/integration/charm_metrics_helpers.py index ac4602a0b..aad98742d 100644 --- a/tests/integration/charm_metrics_helpers.py +++ b/tests/integration/charm_metrics_helpers.py @@ -9,15 +9,20 @@ from time import sleep import requests +from github.Branch import Branch +from github.GithubException import GithubException from github.Repository import Repository from github.Workflow import Workflow +from github.WorkflowJob import WorkflowJob from juju.application import Application from juju.unit import Unit from github_type import JobConclusion from metrics import METRICS_LOG_PATH from runner_metrics import PostJobStatus -from tests.integration.helpers import get_runner_name, run_in_unit +from tests.integration.helpers import get_runner_name, run_in_unit, wait_for + +logger = logging.getLogger(__name__) TEST_WORKFLOW_NAMES = [ "Workflow Dispatch Tests", @@ -26,7 +31,7 @@ ] -async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow): +async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow, branch: Branch | None = None): """Wait for the workflow to start. Args: @@ -34,21 +39,28 @@ async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow): workflow: The workflow to wait for. """ runner_name = await get_runner_name(unit) - for _ in range(30): - for run in workflow.get_runs(): + + def is_runner_log(): + """Return whether a log for given runner exists.""" + for run in workflow.get_runs(branch=branch): jobs = run.jobs() - if jobs: - logs_url = jobs[0].logs_url() - logs = requests.get(logs_url).content.decode("utf-8") + if not jobs: + return False + try: + job: WorkflowJob = jobs[0] + logs = requests.get(job.logs_url()).content.decode("utf-8") + except GithubException as exc: + if exc.status == 410: + logger.warning("Transient github error, %s", exc) + return False + if runner_name in logs: + return True + return False - if runner_name in logs: - break - else: - sleep(30) - continue - break - else: - assert False, "Timeout while waiting for the workflow to start" + try: + await wait_for(is_runner_log, timeout=20 * 60, check_interval=30) + except TimeoutError as exc: + raise TimeoutError("Timeout while waiting for the workflow to start") from exc async def clear_metrics_log(unit: Unit) -> None: diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index 032b29b1e..45777eaa4 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -110,7 +110,7 @@ async def test_charm_issues_metrics_for_abnormal_termination( ) assert workflow.create_dispatch(forked_github_branch, {"runner": app.name}) - await wait_for_workflow_to_start(unit, workflow) + await wait_for_workflow_to_start(unit, workflow, branch=forked_github_branch) # Make the runner terminate abnormally by killing run.sh runner_name = await get_runner_name(unit) From 9f4e7ec9a3c1ac7534c2d9ad7ddd76527a97de5e Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 19:25:43 +0800 Subject: [PATCH 02/19] log inactive subprocess error --- src/charm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/charm.py b/src/charm.py index e595d195a..0d9e72086 100755 --- a/src/charm.py +++ b/src/charm.py @@ -286,8 +286,8 @@ def _ensure_service_health(self) -> None: logger.info("Checking health of repo-policy-compliance service") try: execute_command(["/usr/bin/systemctl", "is-active", "repo-policy-compliance"]) - except SubprocessError: - logger.exception("Found inactive repo-policy-compliance service") + except SubprocessError as exc: + logger.exception("Found inactive repo-policy-compliance service, %s", exc) execute_command(["/usr/bin/systemctl", "restart", "repo-policy-compliance"]) logger.info("Restart repo-policy-compliance service") raise From 10e0751d34a50ed3c2c02eb4010d1784282ed9f8 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 21:11:40 +0800 Subject: [PATCH 03/19] try upload-artifact fix --- .github/workflows/integration_test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 07a1ca231..0b07f17f6 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -8,7 +8,7 @@ jobs: # INTEGRATION_TEST_ARGS to operator-workflows automatically. integration-tests-juju2: name: Integration test with juju 2.9 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact secrets: inherit with: juju-channel: 2.9/stable @@ -18,7 +18,7 @@ jobs: modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' integration-tests: name: Integration test with juju 3.1 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact secrets: inherit with: juju-channel: 3.1/stable @@ -33,7 +33,7 @@ jobs: # - we need to disable the rbac addon for microk8s, otherwise the setup will fail integration-tests-microstack: name: Integration test using microstack - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact secrets: inherit with: juju-channel: 3.2/stable From 879c06e296d9f71cd56362bed2ad27e1ffd8fabc Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 21:17:21 +0800 Subject: [PATCH 04/19] github error on cancel workflow run --- tests/integration/charm_metrics_helpers.py | 23 ++++++++++++------- .../integration/test_charm_metrics_failure.py | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/integration/charm_metrics_helpers.py b/tests/integration/charm_metrics_helpers.py index aad98742d..87daba667 100644 --- a/tests/integration/charm_metrics_helpers.py +++ b/tests/integration/charm_metrics_helpers.py @@ -37,6 +37,7 @@ async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow, branch: Bra Args: unit: The unit which contains the runner. workflow: The workflow to wait for. + branch: The branch where the workflow belongs to. 
""" runner_name = await get_runner_name(unit) @@ -117,23 +118,29 @@ async def get_metrics_log(unit: Unit) -> str: return stdout.strip() -async def cancel_workflow_run(unit: Unit, workflow: Workflow): +async def cancel_workflow_run(unit: Unit, workflow: Workflow, branch: Branch | None = None): """Cancel the workflow run. Args: unit: The unit which contains the runner. workflow: The workflow to cancel the workflow run for. + branch: The branch where the workflow belongs to. """ runner_name = await get_runner_name(unit) - for run in workflow.get_runs(): + for run in workflow.get_runs(branch=branch): jobs = run.jobs() - if jobs: - logs_url = jobs[0].logs_url() - logs = requests.get(logs_url).content.decode("utf-8") - - if runner_name in logs: - run.cancel() + if not jobs: + continue + try: + job: WorkflowJob = jobs[0] + logs = requests.get(job.logs_url()).content.decode("utf-8") + except GithubException as exc: + if exc.status == 410: + logger.warning("Transient github error, %s", exc) + continue + if runner_name in logs: + run.cancel() async def assert_events_after_reconciliation( diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index 45777eaa4..379c8d1ae 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -120,7 +120,7 @@ async def test_charm_issues_metrics_for_abnormal_termination( # Cancel workflow and wait that the runner is marked offline # to avoid errors during reconciliation. - await cancel_workflow_run(unit, workflow) + await cancel_workflow_run(unit, workflow, branch=forked_github_branch) await wait_for_runner_to_be_marked_offline(forked_github_repository, runner_name) # Set the number of virtual machines to 0 to speedup reconciliation From 84cbf74f60f9bf418ad0d176121a089e50e90fcc Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 22:27:56 +0800 Subject: [PATCH 05/19] add stdout/err hints to help debug test failures --- tests/integration/charm_metrics_helpers.py | 12 ++-- tests/integration/helpers.py | 55 ++++++++++++------- .../integration/test_charm_metrics_failure.py | 16 +++--- tests/integration/test_charm_one_runner.py | 22 +++++--- .../test_charm_scheduled_events.py | 6 +- tests/integration/test_charm_with_proxy.py | 36 +++++++----- tests/integration/test_self_hosted_runner.py | 4 +- 7 files changed, 90 insertions(+), 61 deletions(-) diff --git a/tests/integration/charm_metrics_helpers.py b/tests/integration/charm_metrics_helpers.py index 87daba667..c12d92349 100644 --- a/tests/integration/charm_metrics_helpers.py +++ b/tests/integration/charm_metrics_helpers.py @@ -70,11 +70,11 @@ async def clear_metrics_log(unit: Unit) -> None: Args: unit: The unit to clear the metrics log on. """ - retcode, _ = await run_in_unit( + retcode, _, stderr = await run_in_unit( unit=unit, command=f"if [ -f {METRICS_LOG_PATH} ]; then rm {METRICS_LOG_PATH}; fi", ) - assert retcode == 0, "Failed to clear metrics log" + assert retcode == 0, f"Failed to clear metrics log, {stderr}" async def print_loop_device_info(unit: Unit, loop_device: str) -> None: @@ -84,11 +84,11 @@ async def print_loop_device_info(unit: Unit, loop_device: str) -> None: unit: The unit to print the loop device info on. loop_device: The loop device to print the info for. 
""" - retcode, stdout = await run_in_unit( + retcode, stdout, stderr = await run_in_unit( unit=unit, command="sudo losetup -lJ", ) - assert retcode == 0, f"Failed to get loop devices: {stdout}" + assert retcode == 0, f"Failed to get loop devices: {stdout} {stderr}" assert stdout is not None, "Failed to get loop devices, no stdout message" loop_devices_info = json.loads(stdout) for loop_device_info in loop_devices_info["loopdevices"]: @@ -108,11 +108,11 @@ async def get_metrics_log(unit: Unit) -> str: Returns: The metrics log. """ - retcode, stdout = await run_in_unit( + retcode, stdout, stderr = await run_in_unit( unit=unit, command=f"if [ -f {METRICS_LOG_PATH} ]; then cat {METRICS_LOG_PATH}; else echo ''; fi", ) - assert retcode == 0, f"Failed to get metrics log: {stdout}" + assert retcode == 0, f"Failed to get metrics log: {stdout} {stderr}" assert stdout is not None, "Failed to get metrics log, no stdout message" logging.info("Metrics log: %s", stdout) return stdout.strip() diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index aca334a72..04f24bcbd 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -22,6 +22,7 @@ from github.Workflow import Workflow from github.WorkflowJob import WorkflowJob from github.WorkflowRun import WorkflowRun +from juju.action import Action from juju.application import Application from juju.model import Model from juju.unit import Unit @@ -49,7 +50,7 @@ async def check_runner_binary_exists(unit: Unit) -> bool: Returns: Whether the runner binary file exists in the charm. """ - return_code, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") return return_code == 0 @@ -62,10 +63,12 @@ async def get_repo_policy_compliance_pip_info(unit: Unit) -> None | str: Returns: If repo-policy-compliance is installed, returns the pip show output, else returns none. """ - return_code, stdout = await run_in_unit(unit, "python3 -m pip show repo-policy-compliance") + return_code, stdout, stderr = await run_in_unit( + unit, "python3 -m pip show repo-policy-compliance" + ) if return_code == 0: - return stdout + return stdout or stderr return None @@ -77,14 +80,16 @@ async def install_repo_policy_compliance_from_git_source(unit: Unit, source: Non unit: Unit instance to check for the LXD profile. source: The git source to install the package. If none the package is removed. """ - return_code, stdout = await run_in_unit( + return_code, stdout, stderr = await run_in_unit( unit, "python3 -m pip uninstall --yes repo-policy-compliance" ) - assert return_code == 0, f"Failed to uninstall repo-policy-compliance: {stdout}" + assert return_code == 0, f"Failed to uninstall repo-policy-compliance: {stdout} {stderr}" if source: - return_code, _ = await run_in_unit(unit, f"python3 -m pip install {source}") - assert return_code == 0 + return_code, stdout, stderr = await run_in_unit(unit, f"python3 -m pip install {source}") + assert ( + return_code == 0 + ), f"Failed to install repo-policy-compliance from source, {stdout} {stderr}" async def remove_runner_bin(unit: Unit) -> None: @@ -96,7 +101,7 @@ async def remove_runner_bin(unit: Unit) -> None: await run_in_unit(unit, f"rm {RunnerManager.runner_bin_path}") # No file should exists under with the filename. 
- return_code, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") + return_code, _, _ = await run_in_unit(unit, f"test -f {RunnerManager.runner_bin_path}") assert return_code != 0 @@ -117,7 +122,7 @@ async def assert_resource_lxd_profile(unit: Unit, configs: dict[str, Any]) -> No resource_profile_name = Runner._get_resource_profile_name(cpu, mem, disk) # Verify the profile exists. - return_code, stdout = await run_in_unit(unit, "lxc profile list --format json") + return_code, stdout, _ = await run_in_unit(unit, "lxc profile list --format json") assert return_code == 0 assert stdout is not None profiles = json.loads(stdout) @@ -125,7 +130,7 @@ async def assert_resource_lxd_profile(unit: Unit, configs: dict[str, Any]) -> No assert resource_profile_name in profile_names # Verify the profile contains the correct resource settings. - return_code, stdout = await run_in_unit(unit, f"lxc profile show {resource_profile_name}") + return_code, stdout, _ = await run_in_unit(unit, f"lxc profile show {resource_profile_name}") assert return_code == 0 assert stdout is not None profile_content = yaml.safe_load(stdout) @@ -143,7 +148,7 @@ async def get_runner_names(unit: Unit) -> tuple[str, ...]: Returns: Tuple of runner names. """ - return_code, stdout = await run_in_unit(unit, "lxc list --format json") + return_code, stdout, _ = await run_in_unit(unit, "lxc list --format json") assert return_code == 0 assert stdout is not None @@ -164,7 +169,7 @@ async def wait_till_num_of_runners(unit: Unit, num: int) -> None: AssertionError: Correct number of runners is not found within timeout limit. """ - return_code, stdout = await run_in_unit(unit, "lxc list --format json") + return_code, stdout, _ = await run_in_unit(unit, "lxc list --format json") assert return_code == 0 assert stdout is not None @@ -174,7 +179,7 @@ async def wait_till_num_of_runners(unit: Unit, num: int) -> None: ), f"Current number of runners: {len(lxc_instance)} Expected number of runner: {num}" for instance in lxc_instance: - return_code, stdout = await run_in_unit(unit, f"lxc exec {instance['name']} -- ps aux") + return_code, stdout, _ = await run_in_unit(unit, f"lxc exec {instance['name']} -- ps aux") assert return_code == 0 assert stdout is not None @@ -192,7 +197,9 @@ def on_juju_2() -> bool: return not hasattr(juju.version, "SUPPORTED_MAJOR_VERSION") -async def run_in_unit(unit: Unit, command: str, timeout=None) -> tuple[int, str | None]: +async def run_in_unit( + unit: Unit, command: str, timeout=None +) -> tuple[int, str | None, str | None]: """Run command in juju unit. Compatible with juju 3 and juju 2. @@ -203,16 +210,24 @@ async def run_in_unit(unit: Unit, command: str, timeout=None) -> tuple[int, str timeout: Amount of time to wait for the execution. Returns: - Tuple of return code and stdout. + Tuple of return code, stdout and stderr. """ - action = await unit.run(command, timeout) + action: Action = await unit.run(command, timeout) # For compatibility with juju 2. 
if on_juju_2(): - return (int(action.results["Code"]), action.results.get("Stdout", None)) + return ( + int(action.results["Code"]), + action.results.get("Stdout", None), + action.results.get("Stderr", None), + ) await action.wait() - return (action.results["return-code"], action.results.get("stdout", None)) + return ( + action.results["return-code"], + action.results.get("stdout", None), + action.results.get("stderr", None), + ) async def run_in_lxd_instance( @@ -222,7 +237,7 @@ async def run_in_lxd_instance( env: dict[str, str] | None = None, cwd: str | None = None, timeout: int | None = None, -) -> tuple[int, str | None]: +) -> tuple[int, str | None, str | None]: """Run command in LXD instance of a juju unit. Args: @@ -266,7 +281,7 @@ async def start_test_http_server(unit: Unit, port: int): # Test the HTTP server for _ in range(10): - return_code, stdout = await run_in_unit(unit, f"curl http://localhost:{port}") + return_code, stdout, _ = await run_in_unit(unit, f"curl http://localhost:{port}") if return_code == 0 and stdout: break await sleep(3) diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index 379c8d1ae..a0941c7ca 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -115,8 +115,8 @@ async def test_charm_issues_metrics_for_abnormal_termination( # Make the runner terminate abnormally by killing run.sh runner_name = await get_runner_name(unit) kill_run_sh_cmd = "pkill -9 run.sh" - ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd) - assert ret_code == 0, "Failed to kill run.sh" + ret_code, stdout, stderr = await run_in_lxd_instance(unit, runner_name, kill_run_sh_cmd) + assert ret_code == 0, f"Failed to kill run.sh, {stdout} {stderr}" # Cancel workflow and wait that the runner is marked offline # to avoid errors during reconciliation. @@ -151,19 +151,21 @@ async def test_charm_retrieves_logs_from_unhealthy_runners( runner_name = await get_runner_name(unit) kill_start_sh_cmd = "pkill -9 start.sh" - ret_code, _ = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd) - assert ret_code == 0, "Failed to kill start.sh" + ret_code, stdout, stderr = await run_in_lxd_instance(unit, runner_name, kill_start_sh_cmd) + assert ret_code == 0, f"Failed to kill start.sh, {stdout} {stderr}" # Set the number of virtual machines to 0 to avoid to speedup reconciliation. 
await app.set_config({"virtual-machines": "0"}) await reconcile(app=app, model=model) - ret_code, stdout = await run_in_unit(unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}") - assert ret_code == 0, "Failed to list crashed runner logs" + ret_code, stdout, stderr = await run_in_unit( + unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}" + ) + assert ret_code == 0, f"Failed to list crashed runner logs {stdout} {stderr}" assert stdout assert runner_name in stdout, "Failed to find crashed runner log" - ret_code, stdout = await run_in_unit( + ret_code, stdout, _ = await run_in_unit( unit, f"ls {runner_logs.CRASHED_RUNNER_LOGS_DIR_PATH}/{runner_name}" ) assert ret_code == 0, "Failed to list crashed runner log" diff --git a/tests/integration/test_charm_one_runner.py b/tests/integration/test_charm_one_runner.py index eb305b25d..33a11270c 100644 --- a/tests/integration/test_charm_one_runner.py +++ b/tests/integration/test_charm_one_runner.py @@ -53,12 +53,12 @@ async def test_network_access(app: Application) -> None: names = await get_runner_names(unit) assert names - return_code, stdout = await run_in_unit(unit, "lxc network get lxdbr0 ipv4.address") - assert return_code == 0 + return_code, stdout, stderr = await run_in_unit(unit, "lxc network get lxdbr0 ipv4.address") + assert return_code == 0, f"Failed to get network address {stdout} {stderr}" assert stdout is not None host_ip, _ = stdout.split("/", 1) - return_code, stdout = await run_in_lxd_instance( + return_code, stdout, _ = await run_in_lxd_instance( unit, names[0], f"curl http://{host_ip}:{port}" ) @@ -159,11 +159,13 @@ async def test_token_config_changed(model: Model, app: Application, token_alt: s await app.set_config({"token": token_alt}) await model.wait_for_idle(status=ACTIVE, timeout=30 * 60) - return_code, stdout = await run_in_unit( + return_code, stdout, stderr = await run_in_unit( unit, "cat /etc/systemd/system/repo-policy-compliance.service" ) - assert return_code == 0 + assert ( + return_code == 0 + ), f"Failed to get repo-policy-compliance unit file contents {stdout} {stderr}" assert stdout is not None assert f"GITHUB_TOKEN={token_alt}" in stdout @@ -193,8 +195,10 @@ async def test_reconcile_runners_with_lxd_storage_pool_failure( await reconcile(app=app, model=model) await wait_till_num_of_runners(unit, 0) - exit_code, _ = await run_in_unit(unit, f"rm -rf {GithubRunnerCharm.ram_pool_path}/*") - assert exit_code == 0 + exit_code, stdout, stderr = await run_in_unit( + unit, f"rm -rf {GithubRunnerCharm.ram_pool_path}/*" + ) + assert exit_code == 0, f"Failed to delete ram pool {stdout} {stderr}" # 2. 
await app.set_config({"virtual-machines": "1"}) @@ -267,10 +271,10 @@ async def test_disabled_apt_daily_upgrades(model: Model, app: Application) -> No names = await get_runner_names(unit) assert names, "LXD runners not ready" - ret_code, stdout = await run_in_lxd_instance( + ret_code, stdout, stderr = await run_in_lxd_instance( unit, names[0], "sudo systemctl list-units --no-pager" ) - assert ret_code == 0, "Failed to list systemd units" + assert ret_code == 0, f"Failed to list systemd units {stdout} {stderr}" assert stdout, "No units listed in stdout" assert "apt-daily" not in stdout # this also checks for apt-daily-upgrade service diff --git a/tests/integration/test_charm_scheduled_events.py b/tests/integration/test_charm_scheduled_events.py index 29265d35e..3fa5c9a97 100644 --- a/tests/integration/test_charm_scheduled_events.py +++ b/tests/integration/test_charm_scheduled_events.py @@ -44,13 +44,15 @@ async def test_update_interval(model: Model, app_scheduled_events: Application) unit = app_scheduled_events.units[0] assert await check_runner_binary_exists(unit) - await run_in_unit(unit, f"rm -f {RunnerManager.runner_bin_path}") + ret_code, stdout, stderr = await run_in_unit(unit, f"rm -f {RunnerManager.runner_bin_path}") + assert ret_code == 0, f"Failed to remove runner binary {stdout} {stderr}" assert not await check_runner_binary_exists(unit) runner_names = await get_runner_names(unit) assert len(runner_names) == 1 runner_name = runner_names[0] - await run_in_unit(unit, f"lxc stop --force {runner_name}") + ret_code, stdout, stderr = await run_in_unit(unit, f"lxc stop --force {runner_name}") + assert ret_code == 0, f"Failed to stop lxd instance, {stdout} {stderr}" await wait_till_num_of_runners(unit, 0) await sleep(10 * 60) diff --git a/tests/integration/test_charm_with_proxy.py b/tests/integration/test_charm_with_proxy.py index bef653c34..88639aa56 100644 --- a/tests/integration/test_charm_with_proxy.py +++ b/tests/integration/test_charm_with_proxy.py @@ -231,8 +231,8 @@ async def _assert_proxy_vars_in_file(unit: Unit, runner_name: str, file_path: st file_path: The path to the file to check for proxy environment variables. not_set: Whether the proxy environment variables should be set or not. """ - return_code, stdout = await run_in_lxd_instance(unit, runner_name, f"cat {file_path}") - assert return_code == 0, "Failed to read file" + return_code, stdout, stderr = await run_in_lxd_instance(unit, runner_name, f"cat {file_path}") + assert return_code == 0, f"Failed to read file {stdout} {stderr}" assert stdout, "File is empty" _assert_proxy_var_in(stdout, not_in=not_set) @@ -245,7 +245,7 @@ async def _assert_docker_proxy_vars(unit: Unit, runner_name: str, not_set=False) runner_name: The name of the runner. not_set: Whether the proxy environment variables should be set or not. """ - return_code, _ = await run_in_lxd_instance( + return_code, _, _ = await run_in_lxd_instance( unit, runner_name, "docker run --rm alpine sh -c 'env | grep -i _PROXY'" ) assert return_code == (1 if not_set else 0) @@ -296,14 +296,16 @@ async def _get_aproxy_logs(unit: Unit, runner_name: str) -> Optional[str]: Returns: The aproxy logs if existent, otherwise None. 
""" - return_code, stdout = await run_in_lxd_instance( + return_code, stdout, stderr = await run_in_lxd_instance( unit, runner_name, "snap logs aproxy.aproxy -n=all" ) - assert return_code == 0, "Failed to get aproxy logs" + assert return_code == 0, f"Failed to get aproxy logs {stdout} {stderr}" return stdout -async def _curl_as_ubuntu_user(unit: Unit, runner_name: str, url: str) -> tuple[int, str | None]: +async def _curl_as_ubuntu_user( + unit: Unit, runner_name: str, url: str +) -> tuple[int, str | None, str | None]: """Run curl as a logged in ubuntu user. This should simulate the bevahiour of a curl inside the runner with environment variables set. @@ -314,7 +316,7 @@ async def _curl_as_ubuntu_user(unit: Unit, runner_name: str, url: str) -> tuple[ url: The URL to curl. Returns: - The return code and stdout of the curl command. + The return code, stdout, stderr of the curl command. """ return await run_in_lxd_instance( unit, @@ -350,20 +352,22 @@ async def test_usage_of_aproxy(model: Model, app: Application, proxy_logs_filepa _clear_tinyproxy_log(proxy_logs_filepath) # 1. URL with standard port, should succeed, gets intercepted by aproxy - return_code, stdout = await _curl_as_ubuntu_user(unit, runner_name, "http://canonical.com") + return_code, stdout, stderr = await _curl_as_ubuntu_user( + unit, runner_name, "http://canonical.com" + ) assert ( return_code == 0 - ), f"Expected successful connection to http://canonical.com. Error msg: {stdout}" + ), f"Expected successful connection to http://canonical.com. Error msg: {stdout} {stderr}" # 2. URL with non-standard port, should fail, request does not get intercepted by aproxy - return_code, stdout = await _curl_as_ubuntu_user( + return_code, stdout, stderr = await _curl_as_ubuntu_user( unit, runner_name, f"http://canonical.com:{NON_STANDARD_PORT}", ) assert return_code == 7, ( f"Expected cannot connect error for http://canonical.com:{NON_STANDARD_PORT}. " - f"Error msg: {stdout}" + f"Error msg: {stdout} {stderr}" ) aproxy_logs = await _get_aproxy_logs(unit, runner_name) @@ -407,10 +411,12 @@ async def test_use_proxy_without_aproxy( _clear_tinyproxy_log(proxy_logs_filepath) # 1. URL with standard port, should succeed - return_code, stdout = await _curl_as_ubuntu_user(unit, runner_name, "http://canonical.com") + return_code, stdout, stderr = await _curl_as_ubuntu_user( + unit, runner_name, "http://canonical.com" + ) assert ( return_code == 0 - ), f"Expected successful connection to http://canonical.com. Error msg: {stdout}" + ), f"Expected successful connection to http://canonical.com. Error msg: {stdout} {stderr}" # 2. URL with non-standard port, should return an error message by the proxy like this: # @@ -423,14 +429,14 @@ async def test_use_proxy_without_aproxy( #
     # (an HTML error page from tinyproxy whose body ends with
     # "Generated by tinyproxy version 1.11.0.")
# # - return_code, stdout = await _curl_as_ubuntu_user( + return_code, stdout, stderr = await _curl_as_ubuntu_user( unit, runner_name, f"http://canonical.com:{NON_STANDARD_PORT}", ) assert return_code == 0, ( f"Expected error response from proxy for http://canonical.com:{NON_STANDARD_PORT}. " - f"Error msg: {stdout}" + f"Error msg: {stdout} {stderr}" ) proxy_logs = proxy_logs_filepath.read_text(encoding="utf-8") diff --git a/tests/integration/test_self_hosted_runner.py b/tests/integration/test_self_hosted_runner.py index bf76ac68d..95ef034a5 100644 --- a/tests/integration/test_self_hosted_runner.py +++ b/tests/integration/test_self_hosted_runner.py @@ -57,10 +57,10 @@ async def test_dispatch_workflow_with_dockerhub_mirror( runner_to_be_used = names[0] - return_code, stdout = await run_in_lxd_instance( + return_code, stdout, stderr = await run_in_lxd_instance( unit, runner_to_be_used, "cat /etc/docker/daemon.json" ) - assert return_code == 0 + assert return_code == 0, f"Failed to get docker daemon contents, {stdout} {stderr}" assert stdout is not None assert "registry-mirrors" in stdout assert fake_url in stdout From aed2b40aafe0c335dfd1e704dd3fdd4ad068f0d9 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 26 Mar 2024 23:06:19 +0800 Subject: [PATCH 06/19] log apt source list error --- src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index 1b4c074bd..f9cb44f9b 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src/runner_manager.py b/src/runner_manager.py index 8a8992b1c..7e3fafa96 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -25,7 +25,7 @@ import runner_metrics import shared_fs from charm_state import VirtualMachineResources -from errors import IssueMetricEventError, RunnerBinaryError, RunnerCreateError +from errors import IssueMetricEventError, RunnerBinaryError, RunnerCreateError, SubprocessError from github_client import GithubClient from github_type import RunnerApplication, SelfHostedRunner from lxd import LxdClient, LxdInstance @@ -746,7 +746,15 @@ def build_runner_image(self) -> None: Raises: LxdError: Unable to build the LXD image. """ - execute_command(self._build_image_command()) + try: + execute_command(self._build_image_command()) + # This is for debugging common error in test with build image. + # This should be removed before merging. + except SubprocessError as exc: + if "Type 'ubuntu' is not known on line 43 in source lis" in str(exc): + stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"]) + logger.critical("Failed to execute build image command: %s %s", stdout, ret_code) + raise def schedule_build_runner_image(self) -> None: """Install cron job for building runner image.""" From 5333de0e0238b99c8c6e8f770323d3f4a93eaf1a Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 02:46:27 +0800 Subject: [PATCH 07/19] log before error --- src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index f9cb44f9b..cf390d14b 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. 
---

- 
+ 

 ### function `schedule_build_runner_image`

diff --git a/src/runner_manager.py b/src/runner_manager.py
index 7e3fafa96..d4710c4ca 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -747,11 +747,13 @@ def build_runner_image(self) -> None:
             LxdError: Unable to build the LXD image.
         """
         try:
+            stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"])
+            logger.error("Before build image command: %s %s", stdout, ret_code)
             execute_command(self._build_image_command())
         # This is for debugging common error in test with build image.
         # This should be removed before merging.
         except SubprocessError as exc:
-            if "Type 'ubuntu' is not known on line 43 in source lis" in str(exc):
+            if "Type 'ubuntu' is not known on line 43 in source list" in str(exc):
                 stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"])
                 logger.critical("Failed to execute build image command: %s %s", stdout, ret_code)
             raise

From 56c8261914c9089e40585a0ee68b250550597102 Mon Sep 17 00:00:00 2001
From: charlie4284
Date: Wed, 27 Mar 2024 02:48:35 +0800
Subject: [PATCH 08/19] log builder sources.list

---
 src/runner_manager.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/runner_manager.py b/src/runner_manager.py
index d4710c4ca..ab376aa23 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -747,14 +747,14 @@ def build_runner_image(self) -> None:
             LxdError: Unable to build the LXD image.
         """
         try:
-            stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"])
-            logger.error("Before build image command: %s %s", stdout, ret_code)
             execute_command(self._build_image_command())
         # This is for debugging common error in test with build image.
         # This should be removed before merging.
         except SubprocessError as exc:
             if "Type 'ubuntu' is not known on line 43 in source list" in str(exc):
-                stdout, ret_code = execute_command(["cat", "/etc/apt/sources.list"])
+                stdout, ret_code = execute_command(
+                    ["/snap/bin/lxc", "exec", "builder", "--", "cat /etc/apt/sources.list"]
+                )
                 logger.critical("Failed to execute build image command: %s %s", stdout, ret_code)
             raise

From edebfe394bc476d681150f17816dbdb5e596314b Mon Sep 17 00:00:00 2001
From: charlie4284
Date: Wed, 27 Mar 2024 10:02:44 +0800
Subject: [PATCH 09/19] apt exec command fix

---
 src/runner_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runner_manager.py b/src/runner_manager.py
index ab376aa23..119dcade6 100644
--- a/src/runner_manager.py
+++ b/src/runner_manager.py
@@ -753,7 +753,7 @@ def build_runner_image(self) -> None:
         except SubprocessError as exc:
             if "Type 'ubuntu' is not known on line 43 in source list" in str(exc):
                 stdout, ret_code = execute_command(
-                    ["/snap/bin/lxc", "exec", "builder", "--", "cat /etc/apt/sources.list"]
+                    ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"]
                 )
                 logger.critical("Failed to execute build image command: %s %s", stdout, ret_code)
                 raise

From 5c01e3db89c8d6d882411e6e06288c87dd91554a Mon Sep 17 00:00:00 2001
From: charlie4284
Date: Wed, 27 Mar 2024 11:22:06 +0800
Subject: [PATCH 10/19] add tmate debug

---
 .github/workflows/integration_test.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
index 0b07f17f6..821d873c6 100644
--- a/.github/workflows/integration_test.yaml
+++ b/.github/workflows/integration_test.yaml
@@ -16,6 +16,8 @@ jobs:
       provider: lxd
       test-tox-env: integration-juju2.9
      modules: '["test_charm_fork_repo", "test_charm_no_runner", 
"test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' + tmate-debug: true + tmate-timeout: 90 integration-tests: name: Integration test with juju 3.1 uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact @@ -26,6 +28,8 @@ jobs: provider: lxd test-tox-env: integration-juju3.1 modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' + tmate-debug: true + tmate-timeout: 90 # openstack tests use microstack, whose setup is kind of special # - due to the huge resource requirements, we use self-hosted runners for these tests # - microstack requires juju 3.2 and microk8s 1.26 From 79debedc50851b612aa5f9d351fe54e989aa9526 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 15:35:13 +0800 Subject: [PATCH 11/19] retry on assertion error --- tests/integration/helpers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 04f24bcbd..34e6f52e8 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -5,6 +5,7 @@ import inspect import json +import logging import subprocess import time import typing @@ -40,6 +41,8 @@ DEFAULT_RUNNER_CONSTRAINTS = {"root-disk": 15} +logger = logging.getLogger(__name__) + async def check_runner_binary_exists(unit: Unit) -> bool: """Checks if runner binary exists in the charm. @@ -157,7 +160,7 @@ async def get_runner_names(unit: Unit) -> tuple[str, ...]: return tuple(runner["name"] for runner in lxc_instance) -@retry(tries=30, delay=30) +@retry(exception=AssertionError, tries=30, delay=30, local_logger=logger) async def wait_till_num_of_runners(unit: Unit, num: int) -> None: """Wait and check the number of runners. From a6e3be5dc33f279a99069ba94c11513a60f8bce2 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 20:42:26 +0800 Subject: [PATCH 12/19] replace retry --- tests/integration/helpers.py | 41 ++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 34e6f52e8..3e8f640dd 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -31,7 +31,6 @@ from runner import Runner from runner_manager import RunnerManager from tests.status_name import ACTIVE -from utilities import retry DISPATCH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_test.yaml" DISPATCH_CRASH_TEST_WORKFLOW_FILENAME = "workflow_dispatch_crash_test.yaml" @@ -160,8 +159,7 @@ async def get_runner_names(unit: Unit) -> tuple[str, ...]: return tuple(runner["name"] for runner in lxc_instance) -@retry(exception=AssertionError, tries=30, delay=30, local_logger=logger) -async def wait_till_num_of_runners(unit: Unit, num: int) -> None: +async def wait_till_num_of_runners(unit: Unit, num: int, timeout: int = 10 * 60) -> None: """Wait and check the number of runners. Args: @@ -172,16 +170,37 @@ async def wait_till_num_of_runners(unit: Unit, num: int) -> None: AssertionError: Correct number of runners is not found within timeout limit. 
""" - return_code, stdout, _ = await run_in_unit(unit, "lxc list --format json") - assert return_code == 0 - assert stdout is not None - lxc_instance = json.loads(stdout) - assert ( - len(lxc_instance) == num - ), f"Current number of runners: {len(lxc_instance)} Expected number of runner: {num}" + async def get_lxc_instances() -> None | list[dict]: + """Get lxc instances list info. + + Returns: + List of lxc instance dictionaries, None if failed to get list. + """ + return_code, stdout, _ = await run_in_unit(unit, "lxc list --format json") + if return_code != 0 or not stdout: + logger.error("Failed to run lxc list, %s", return_code) + return None + return json.loads(stdout) + + async def is_desired_num_runners(): + """Return whether there are desired number of lxc instances running. + + Returns: + Whether the desired number of lxc runners have been reached. + """ + lxc_instances = await get_lxc_instances() + if lxc_instances is None: + return False + return len(lxc_instances) == num + + await wait_for(is_desired_num_runners, timeout=timeout, check_interval=30) + + instances = await get_lxc_instances() + if not instances: + return - for instance in lxc_instance: + for instance in instances: return_code, stdout, _ = await run_in_unit(unit, f"lxc exec {instance['name']} -- ps aux") assert return_code == 0 From 197b28c7609631874ecc8e8ade4a8d32309c7989 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 22:01:50 +0800 Subject: [PATCH 13/19] filter builder from runner --- tests/integration/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 3e8f640dd..e8b2c2a17 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -156,7 +156,7 @@ async def get_runner_names(unit: Unit) -> tuple[str, ...]: assert stdout is not None lxc_instance: list[dict[str, str]] = json.loads(stdout) - return tuple(runner["name"] for runner in lxc_instance) + return tuple(runner["name"] for runner in lxc_instance if runner["name"] != "builder") async def wait_till_num_of_runners(unit: Unit, num: int, timeout: int = 10 * 60) -> None: From 475da6b8b66a1ce5a4b480188857e23219bc5b22 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Wed, 27 Mar 2024 22:08:43 +0800 Subject: [PATCH 14/19] debug source.list bug --- src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index cf390d14b..bba70e97f 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src/runner_manager.py b/src/runner_manager.py index 119dcade6..975ccf3dd 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -747,14 +747,16 @@ def build_runner_image(self) -> None: LxdError: Unable to build the LXD image. """ try: + # This is for debugging common error in test with build image. + # This should be removed before merging. + stdout, ret_code = execute_command( + ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] + ) execute_command(self._build_image_command()) # This is for debugging common error in test with build image. # This should be removed before merging. 
except SubprocessError as exc: if "Type 'ubuntu' is not known on line 43 in source list" in str(exc): - stdout, ret_code = execute_command( - ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] - ) logger.critical("Failed to execute build image command: %s %s", stdout, ret_code) raise From b5d487d291764e73c59f6825fa769a089f9d575c Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Thu, 28 Mar 2024 07:02:05 +0800 Subject: [PATCH 15/19] revert log --- src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index bba70e97f..cf390d14b 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src/runner_manager.py b/src/runner_manager.py index 975ccf3dd..119dcade6 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -747,16 +747,14 @@ def build_runner_image(self) -> None: LxdError: Unable to build the LXD image. """ try: - # This is for debugging common error in test with build image. - # This should be removed before merging. - stdout, ret_code = execute_command( - ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] - ) execute_command(self._build_image_command()) # This is for debugging common error in test with build image. # This should be removed before merging. except SubprocessError as exc: if "Type 'ubuntu' is not known on line 43 in source list" in str(exc): + stdout, ret_code = execute_command( + ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] + ) logger.critical("Failed to execute build image command: %s %s", stdout, ret_code) raise From 23a139b65cf6fcf7748c0b6122b0954f1c4cc1ee Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 2 Apr 2024 13:29:30 +0800 Subject: [PATCH 16/19] revert debug commits --- .github/workflows/integration_test.yaml | 8 ++------ src-docs/runner_manager.py.md | 2 +- src/runner_manager.py | 14 ++------------ 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 821d873c6..046e4d921 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -16,11 +16,9 @@ jobs: provider: lxd test-tox-env: integration-juju2.9 modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' - tmate-debug: true - tmate-timeout: 90 integration-tests: name: Integration test with juju 3.1 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main secrets: inherit with: juju-channel: 3.1/stable @@ -28,8 +26,6 @@ jobs: provider: lxd test-tox-env: integration-juju3.1 modules: '["test_charm_fork_repo", "test_charm_no_runner", "test_charm_scheduled_events", "test_charm_one_runner", "test_charm_metrics_success", "test_charm_metrics_failure", "test_self_hosted_runner", "test_charm_with_proxy", "test_charm_with_juju_storage", "test_debug_ssh"]' - tmate-debug: true - tmate-timeout: 90 # openstack tests use microstack, whose setup is kind of special # - due to the huge resource 
requirements, we use self-hosted runners for these tests # - microstack requires juju 3.2 and microk8s 1.26 @@ -37,7 +33,7 @@ jobs: # - we need to disable the rbac addon for microk8s, otherwise the setup will fail integration-tests-microstack: name: Integration test using microstack - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main secrets: inherit with: juju-channel: 3.2/stable diff --git a/src-docs/runner_manager.py.md b/src-docs/runner_manager.py.md index cf390d14b..1b4c074bd 100644 --- a/src-docs/runner_manager.py.md +++ b/src-docs/runner_manager.py.md @@ -175,7 +175,7 @@ Bring runners in line with target. --- - + ### function `schedule_build_runner_image` diff --git a/src/runner_manager.py b/src/runner_manager.py index 119dcade6..8a8992b1c 100644 --- a/src/runner_manager.py +++ b/src/runner_manager.py @@ -25,7 +25,7 @@ import runner_metrics import shared_fs from charm_state import VirtualMachineResources -from errors import IssueMetricEventError, RunnerBinaryError, RunnerCreateError, SubprocessError +from errors import IssueMetricEventError, RunnerBinaryError, RunnerCreateError from github_client import GithubClient from github_type import RunnerApplication, SelfHostedRunner from lxd import LxdClient, LxdInstance @@ -746,17 +746,7 @@ def build_runner_image(self) -> None: Raises: LxdError: Unable to build the LXD image. """ - try: - execute_command(self._build_image_command()) - # This is for debugging common error in test with build image. - # This should be removed before merging. - except SubprocessError as exc: - if "Type 'ubuntu' is not known on line 43 in source list" in str(exc): - stdout, ret_code = execute_command( - ["/snap/bin/lxc", "exec", "builder", "cat", "/etc/apt/sources.list"] - ) - logger.critical("Failed to execute build image command: %s %s", stdout, ret_code) - raise + execute_command(self._build_image_command()) def schedule_build_runner_image(self) -> None: """Install cron job for building runner image.""" From 5bda3fd5ce676c6033287234b806e578e64c57b3 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Tue, 2 Apr 2024 13:30:04 +0800 Subject: [PATCH 17/19] revert debug commits --- .github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 046e4d921..07a1ca231 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -8,7 +8,7 @@ jobs: # INTEGRATION_TEST_ARGS to operator-workflows automatically. 
integration-tests-juju2: name: Integration test with juju 2.9 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@fix/upload-artifact + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main secrets: inherit with: juju-channel: 2.9/stable From a0bf2bcfcd50c4eae6e503301dc2a5c5a95e6881 Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Mon, 8 Apr 2024 11:03:42 +0800 Subject: [PATCH 18/19] add time since param --- tests/integration/charm_metrics_helpers.py | 14 ++++++++++++-- tests/integration/test_charm_metrics_failure.py | 6 +++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/integration/charm_metrics_helpers.py b/tests/integration/charm_metrics_helpers.py index c12d92349..55d2f1d18 100644 --- a/tests/integration/charm_metrics_helpers.py +++ b/tests/integration/charm_metrics_helpers.py @@ -4,6 +4,7 @@ """Utilities for charm metrics integration tests.""" +import datetime import json import logging from time import sleep @@ -31,19 +32,28 @@ ] -async def wait_for_workflow_to_start(unit: Unit, workflow: Workflow, branch: Branch | None = None): +async def wait_for_workflow_to_start( + unit: Unit, workflow: Workflow, branch: Branch | None = None, started_time: float | None = None +): """Wait for the workflow to start. Args: unit: The unit which contains the runner. workflow: The workflow to wait for. branch: The branch where the workflow belongs to. + started_time: The time in seconds since epoch the job was started. """ runner_name = await get_runner_name(unit) + created_at = ( + None + if not started_time + # convert to integer since GH API takes up to seconds. + else f">={datetime.datetime.fromtimestamp(int(started_time)).isoformat()}" + ) def is_runner_log(): """Return whether a log for given runner exists.""" - for run in workflow.get_runs(branch=branch): + for run in workflow.get_runs(branch=branch, created=created_at): jobs = run.jobs() if not jobs: return False diff --git a/tests/integration/test_charm_metrics_failure.py b/tests/integration/test_charm_metrics_failure.py index a0941c7ca..933ebc5d2 100644 --- a/tests/integration/test_charm_metrics_failure.py +++ b/tests/integration/test_charm_metrics_failure.py @@ -2,6 +2,7 @@ # See LICENSE file for licensing details. 
"""Integration tests for metrics/logs assuming Github workflow failures or a runner crash.""" +import time from typing import AsyncIterator import pytest @@ -108,9 +109,12 @@ async def test_charm_issues_metrics_for_abnormal_termination( workflow = forked_github_repository.get_workflow( id_or_file_name=DISPATCH_CRASH_TEST_WORKFLOW_FILENAME ) + dispatch_time = time.time() assert workflow.create_dispatch(forked_github_branch, {"runner": app.name}) - await wait_for_workflow_to_start(unit, workflow, branch=forked_github_branch) + await wait_for_workflow_to_start( + unit, workflow, branch=forked_github_branch, started_time=dispatch_time + ) # Make the runner terminate abnormally by killing run.sh runner_name = await get_runner_name(unit) From 19323f7a112eb4d37a66c97056c0ff5b191187ca Mon Sep 17 00:00:00 2001 From: charlie4284 Date: Mon, 8 Apr 2024 11:04:05 +0800 Subject: [PATCH 19/19] remove redundant exc --- src/charm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/charm.py b/src/charm.py index 0d9e72086..e7531bc81 100755 --- a/src/charm.py +++ b/src/charm.py @@ -286,8 +286,8 @@ def _ensure_service_health(self) -> None: logger.info("Checking health of repo-policy-compliance service") try: execute_command(["/usr/bin/systemctl", "is-active", "repo-policy-compliance"]) - except SubprocessError as exc: - logger.exception("Found inactive repo-policy-compliance service, %s", exc) + except SubprocessError: + logger.exception("Found inactive repo-policy-compliance service.") execute_command(["/usr/bin/systemctl", "restart", "repo-policy-compliance"]) logger.info("Restart repo-policy-compliance service") raise