From c379b0de04dae8637523c5467c346f59ad4d7419 Mon Sep 17 00:00:00 2001 From: Dragomir Penev <6687393+dragomirp@users.noreply.github.com> Date: Fri, 28 Jun 2024 22:27:56 +0300 Subject: [PATCH] [DPE-4799] Fix upgrade bugs when switching charms (#268) * Suppress rendering hooks when upgrading * Render auth file during upgrade * Add data integrator upgrade test * Fix data integrator upgrade test * Jimmyrig the upgrade lib * Upgrade two data-integrator subordinates * Missig wait in rollback test * Actually use the fault injection charm --- lib/charms/data_platform_libs/v0/upgrade.py | 4 +- src/charm.py | 20 +++- src/relations/peers.py | 5 + src/upgrade.py | 9 +- .../relations/pgbouncer_provider/helpers.py | 16 ++- .../test_data_integrator.py | 17 +-- tests/integration/test_upgrade.py | 8 +- .../test_upgrade_data_integrator.py | 107 ++++++++++++++++++ tests/unit/relations/test_peers.py | 1 + tests/unit/test_charm.py | 1 + 10 files changed, 165 insertions(+), 23 deletions(-) create mode 100644 tests/integration/test_upgrade_data_integrator.py diff --git a/lib/charms/data_platform_libs/v0/upgrade.py b/lib/charms/data_platform_libs/v0/upgrade.py index 18a58ff09..4d909d644 100644 --- a/lib/charms/data_platform_libs/v0/upgrade.py +++ b/lib/charms/data_platform_libs/v0/upgrade.py @@ -285,7 +285,7 @@ def restart(self, event) -> None: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 17 +LIBPATCH = 18 PYDEPS = ["pydantic>=1.10,<2", "poetry-core"] @@ -921,7 +921,7 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None: self.charm.unit.status = WaitingStatus("other units upgrading first...") self.peer_relation.data[self.charm.unit].update({"state": "ready"}) - if self.charm.app.planned_units() == 1: + if len(self.app_units) == 1: # single unit upgrade, emit upgrade_granted event right away getattr(self.on, "upgrade_granted").emit() diff --git a/src/charm.py b/src/charm.py index 0111fa344..f723b9b36 100755 --- a/src/charm.py +++ b/src/charm.py @@ -25,7 +25,7 @@ from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer from jinja2 import Template from ops import JujuVersion -from ops.charm import CharmBase +from ops.charm import CharmBase, StartEvent from ops.main import main from ops.model import ( ActiveStatus, @@ -396,11 +396,17 @@ def update_config(self) -> bool: return True - def _on_start(self, _) -> None: + def _on_start(self, event: StartEvent) -> None: """On Start hook. Runs pgbouncer through systemd (configured in src/pgbouncer.service) """ + # Safeguard against starting while upgrading. + if not self.upgrade.idle: + logger.debug("Defer on_start: Cluster is upgrading") + event.defer() + return + # Done first to instantiate the snap's private tmp self.unit.set_workload_version(self.version) @@ -481,6 +487,10 @@ def _on_update_status(self, _) -> None: Sets BlockedStatus if we have no backend database; if we can't connect to a backend, this charm serves no purpose. """ + if not self.upgrade.idle: + logger.debug("Early exit on_update_status: Cluster is upgrading") + return + self.update_status() self.peers.update_leader() @@ -508,6 +518,11 @@ def _on_config_changed(self, event) -> None: Reads charm config values, generates derivative values, writes new pgbouncer config, and restarts pgbouncer to apply changes. """ + if not self.upgrade.idle: + logger.debug("Defer on_config_changed: Cluster is upgrading") + event.defer() + return + old_port = self.peers.app_databag.get("current_port") port_changed = old_port != str(self.config["listen_port"]) if port_changed and self._is_exposed: @@ -529,6 +544,7 @@ def _on_config_changed(self, event) -> None: logger.warning("Deferring on_config_changed: cannot set secret label") event.defer() return + if self.backend.postgres: self.render_prometheus_service() diff --git a/src/relations/peers.py b/src/relations/peers.py index 7c74e2f2f..86a42c474 100644 --- a/src/relations/peers.py +++ b/src/relations/peers.py @@ -126,6 +126,11 @@ def _on_changed(self, event: HookEvent): """If the current unit is a follower, write updated config and auth files to filesystem.""" self.unit_databag.update({ADDRESS_KEY: self.charm.unit_ip}) + if not self.charm.upgrade.idle: + logger.debug("Defer on_start: Cluster is upgrading") + event.defer() + return + self.update_leader() if auth_file := self.charm.get_secret(APP_SCOPE, AUTH_FILE_DATABAG_KEY): self.charm.render_auth_file(auth_file) diff --git a/src/upgrade.py b/src/upgrade.py index ff8a81f76..71f7978fa 100644 --- a/src/upgrade.py +++ b/src/upgrade.py @@ -20,7 +20,7 @@ from tenacity import Retrying, stop_after_attempt, wait_fixed from typing_extensions import override -from constants import PGB, SNAP_PACKAGES +from constants import APP_SCOPE, AUTH_FILE_DATABAG_KEY, PGB, SNAP_PACKAGES DEFAULT_MESSAGE = "Pre-upgrade check failed and cannot safely upgrade" @@ -92,6 +92,13 @@ def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None: self.charm.create_instance_directories() self.charm.render_pgb_config() + if ( + (auth_file := self.charm.get_secret(APP_SCOPE, AUTH_FILE_DATABAG_KEY)) + and self.charm.backend.auth_user + and self.charm.backend.auth_user in auth_file + ): + self.charm.render_auth_file(auth_file) + self.charm.render_utility_files() self.charm.reload_pgbouncer() if self.charm.backend.postgres: diff --git a/tests/integration/relations/pgbouncer_provider/helpers.py b/tests/integration/relations/pgbouncer_provider/helpers.py index 4e9c8bd61..97e294fb6 100644 --- a/tests/integration/relations/pgbouncer_provider/helpers.py +++ b/tests/integration/relations/pgbouncer_provider/helpers.py @@ -4,11 +4,12 @@ import asyncio import json import logging -from typing import Optional +from typing import Dict, Optional from uuid import uuid4 import psycopg2 import yaml +from juju.unit import Unit from pytest_operator.plugin import OpsTest from tenacity import Retrying, stop_after_attempt, wait_fixed @@ -161,6 +162,19 @@ async def check_new_relation(ops_test: OpsTest, unit_name, relation_id, relation assert smoke_val in json.loads(run_update_query["results"])[0] +async def fetch_action_get_credentials(unit: Unit) -> Dict: + """Helper to run an action to fetch connection info. + + Args: + unit: The juju unit on which to run the get_credentials action for credentials + Returns: + A dictionary with the username, password and access info for the service + """ + action = await unit.run_action(action_name="get-credentials") + result = await action.wait() + return result.results + + def check_exposed_connection(credentials, tls): table_name = "expose_test" smoke_val = str(uuid4()) diff --git a/tests/integration/relations/pgbouncer_provider/test_data_integrator.py b/tests/integration/relations/pgbouncer_provider/test_data_integrator.py index e92024daf..1199c4635 100644 --- a/tests/integration/relations/pgbouncer_provider/test_data_integrator.py +++ b/tests/integration/relations/pgbouncer_provider/test_data_integrator.py @@ -3,10 +3,8 @@ # See LICENSE file for licensing details. import asyncio import logging -from typing import Dict import pytest -from juju.unit import Unit from pytest_operator.plugin import OpsTest from constants import BACKEND_RELATION_NAME, PGB_CONF_DIR @@ -19,7 +17,7 @@ get_unit_cores, ) from ...juju_ import juju_major_version -from .helpers import check_exposed_connection +from .helpers import check_exposed_connection, fetch_action_get_credentials logger = logging.getLogger(__name__) @@ -41,19 +39,6 @@ tls_config = {"ca-common-name": "Test CA"} -async def fetch_action_get_credentials(unit: Unit) -> Dict: - """Helper to run an action to fetch connection info. - - Args: - unit: The juju unit on which to run the get_credentials action for credentials - Returns: - A dictionary with the username, password and access info for the service - """ - action = await unit.run_action(action_name="get-credentials") - result = await action.wait() - return result.results - - @pytest.mark.group(1) @pytest.mark.abort_on_fail async def test_deploy_and_relate(ops_test: OpsTest, pgb_charm_jammy): diff --git a/tests/integration/test_upgrade.py b/tests/integration/test_upgrade.py index c96f0dae6..ed248e9a1 100644 --- a/tests/integration/test_upgrade.py +++ b/tests/integration/test_upgrade.py @@ -152,7 +152,7 @@ async def test_fail_and_rollback(ops_test, continuous_writes, pgb_charm_jammy) - application = ops_test.model.applications[PGB] logger.info("Refresh the charm") - await application.refresh(path=pgb_charm_jammy) + await application.refresh(path=fault_charm) logger.info("Wait for upgrade to fail") async with ops_test.fast_forward("60s"): @@ -171,6 +171,12 @@ async def test_fail_and_rollback(ops_test, continuous_writes, pgb_charm_jammy) - logger.info("Re-refresh the charm") await application.refresh(path=pgb_charm_jammy) + logger.info("Wait for upgrade to start") + await ops_test.model.block_until( + lambda: "waiting" in {unit.workload_status for unit in application.units}, + timeout=TIMEOUT, + ) + logger.info("Wait for application to recover") async with ops_test.fast_forward("60s"): await ops_test.model.wait_for_idle(apps=[PGB], status="active", timeout=TIMEOUT) diff --git a/tests/integration/test_upgrade_data_integrator.py b/tests/integration/test_upgrade_data_integrator.py new file mode 100644 index 000000000..cde238cbd --- /dev/null +++ b/tests/integration/test_upgrade_data_integrator.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +# Copyright 2024 Canonical Ltd. +# See LICENSE file for licensing details. +import asyncio +import logging + +import pytest +from pytest_operator.plugin import OpsTest + +from constants import BACKEND_RELATION_NAME + +from .helpers.helpers import ( + PG, + PGB, +) +from .helpers.postgresql_helpers import get_leader_unit +from .relations.pgbouncer_provider.helpers import ( + check_exposed_connection, + fetch_action_get_credentials, +) + +logger = logging.getLogger(__name__) + +TIMEOUT = 600 +DATA_INTEGRATOR_APP_NAME = "data-integrator" + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_deploy_stable(ops_test: OpsTest, pgb_charm_jammy) -> None: + """Simple test to ensure that the PostgreSQL and application charms get deployed.""" + await asyncio.gather( + ops_test.model.deploy( + PG, + num_units=3, + channel="14/edge", + config={"profile": "testing"}, + ), + ops_test.model.deploy( + PGB, + channel="1/stable", + num_units=None, + ), + ops_test.model.deploy( + DATA_INTEGRATOR_APP_NAME, + num_units=2, + channel="latest/edge", + config={"database-name": "test-database"}, + ), + ) + logger.info("Wait for applications to become active") + + await ops_test.model.add_relation(f"{PGB}:{BACKEND_RELATION_NAME}", f"{PG}:database") + await ops_test.model.add_relation(DATA_INTEGRATOR_APP_NAME, PGB) + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle( + apps=[PG, PGB, DATA_INTEGRATOR_APP_NAME], status="active", timeout=1500 + ) + assert len(ops_test.model.applications[PG].units) == 3 + assert len(ops_test.model.applications[PGB].units) == 2 + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_pre_upgrade_check(ops_test: OpsTest) -> None: + """Test that the pre-upgrade-check action runs successfully.""" + logger.info("Get leader unit") + leader_unit = await get_leader_unit(ops_test, PGB) + assert leader_unit is not None, "No leader unit found" + + logger.info("Run pre-upgrade-check action") + action = await leader_unit.run_action("pre-upgrade-check") + await action.wait() + + +@pytest.mark.group(1) +@pytest.mark.abort_on_fail +async def test_upgrade_from_stable(ops_test: OpsTest, pgb_charm_jammy): + """Test updating from stable channel.""" + credentials = await fetch_action_get_credentials( + ops_test.model.applications[DATA_INTEGRATOR_APP_NAME].units[0] + ) + check_exposed_connection(credentials, False) + + application = ops_test.model.applications[PGB] + actions = await application.get_actions() + + logger.info("Refresh the charm") + await application.refresh(path=pgb_charm_jammy) + + logger.info("Wait for upgrade to start") + await ops_test.model.block_until( + lambda: ("waiting" if "pre-upgrade-check" in actions else "maintenance") + in {unit.workload_status for unit in application.units}, + timeout=TIMEOUT, + ) + + logger.info("Wait for upgrade to complete") + async with ops_test.fast_forward("60s"): + await ops_test.model.wait_for_idle( + apps=[PGB], status="active", idle_period=30, timeout=TIMEOUT + ) + + credentials = await fetch_action_get_credentials( + ops_test.model.applications[DATA_INTEGRATOR_APP_NAME].units[0] + ) + check_exposed_connection(credentials, False) diff --git a/tests/unit/relations/test_peers.py b/tests/unit/relations/test_peers.py index d6e8d58b5..8ceecfd3e 100644 --- a/tests/unit/relations/test_peers.py +++ b/tests/unit/relations/test_peers.py @@ -25,6 +25,7 @@ def setUp(self): self.charm = self.harness.charm self.app = self.charm.app.name self.unit = self.charm.unit.name + self.harness.add_relation("upgrade", self.charm.app.name) self.rel_id = self.harness.add_relation(PEER_RELATION_NAME, self.charm.app.name) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index dcb9a3a48..8e2317d1e 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -50,6 +50,7 @@ def setUp(self): self.charm = self.harness.charm self.unit = self.harness.charm.unit + self.harness.add_relation("upgrade", self.charm.app.name) self.rel_id = self.harness.add_relation(PEER_RELATION_NAME, self.charm.app.name) @pytest.fixture