Skip to content

Commit

Permalink
[DPE-4799] Fix upgrade bugs when switching charms (#268)
Browse files Browse the repository at this point in the history
* Suppress rendering hooks when upgrading

* Render auth file during upgrade

* Add data integrator upgrade test

* Fix data integrator upgrade test

* Jimmyrig the upgrade lib

* Upgrade two data-integrator subordinates

* Missig wait in rollback test

* Actually use the fault injection charm
  • Loading branch information
dragomirp authored Jun 28, 2024
1 parent 6c83f6e commit c379b0d
Show file tree
Hide file tree
Showing 10 changed files with 165 additions and 23 deletions.
4 changes: 2 additions & 2 deletions lib/charms/data_platform_libs/v0/upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def restart(self, event) -> None:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 17
LIBPATCH = 18

PYDEPS = ["pydantic>=1.10,<2", "poetry-core"]

Expand Down Expand Up @@ -921,7 +921,7 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None:
self.charm.unit.status = WaitingStatus("other units upgrading first...")
self.peer_relation.data[self.charm.unit].update({"state": "ready"})

if self.charm.app.planned_units() == 1:
if len(self.app_units) == 1:
# single unit upgrade, emit upgrade_granted event right away
getattr(self.on, "upgrade_granted").emit()

Expand Down
20 changes: 18 additions & 2 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer
from jinja2 import Template
from ops import JujuVersion
from ops.charm import CharmBase
from ops.charm import CharmBase, StartEvent
from ops.main import main
from ops.model import (
ActiveStatus,
Expand Down Expand Up @@ -396,11 +396,17 @@ def update_config(self) -> bool:

return True

def _on_start(self, _) -> None:
def _on_start(self, event: StartEvent) -> None:
"""On Start hook.
Runs pgbouncer through systemd (configured in src/pgbouncer.service)
"""
# Safeguard against starting while upgrading.
if not self.upgrade.idle:
logger.debug("Defer on_start: Cluster is upgrading")
event.defer()
return

# Done first to instantiate the snap's private tmp
self.unit.set_workload_version(self.version)

Expand Down Expand Up @@ -481,6 +487,10 @@ def _on_update_status(self, _) -> None:
Sets BlockedStatus if we have no backend database; if we can't connect to a backend, this
charm serves no purpose.
"""
if not self.upgrade.idle:
logger.debug("Early exit on_update_status: Cluster is upgrading")
return

self.update_status()

self.peers.update_leader()
Expand Down Expand Up @@ -508,6 +518,11 @@ def _on_config_changed(self, event) -> None:
Reads charm config values, generates derivative values, writes new pgbouncer config, and
restarts pgbouncer to apply changes.
"""
if not self.upgrade.idle:
logger.debug("Defer on_config_changed: Cluster is upgrading")
event.defer()
return

old_port = self.peers.app_databag.get("current_port")
port_changed = old_port != str(self.config["listen_port"])
if port_changed and self._is_exposed:
Expand All @@ -529,6 +544,7 @@ def _on_config_changed(self, event) -> None:
logger.warning("Deferring on_config_changed: cannot set secret label")
event.defer()
return

if self.backend.postgres:
self.render_prometheus_service()

Expand Down
5 changes: 5 additions & 0 deletions src/relations/peers.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ def _on_changed(self, event: HookEvent):
"""If the current unit is a follower, write updated config and auth files to filesystem."""
self.unit_databag.update({ADDRESS_KEY: self.charm.unit_ip})

if not self.charm.upgrade.idle:
logger.debug("Defer on_start: Cluster is upgrading")
event.defer()
return

self.update_leader()
if auth_file := self.charm.get_secret(APP_SCOPE, AUTH_FILE_DATABAG_KEY):
self.charm.render_auth_file(auth_file)
Expand Down
9 changes: 8 additions & 1 deletion src/upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from tenacity import Retrying, stop_after_attempt, wait_fixed
from typing_extensions import override

from constants import PGB, SNAP_PACKAGES
from constants import APP_SCOPE, AUTH_FILE_DATABAG_KEY, PGB, SNAP_PACKAGES

DEFAULT_MESSAGE = "Pre-upgrade check failed and cannot safely upgrade"

Expand Down Expand Up @@ -92,6 +92,13 @@ def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:

self.charm.create_instance_directories()
self.charm.render_pgb_config()
if (
(auth_file := self.charm.get_secret(APP_SCOPE, AUTH_FILE_DATABAG_KEY))
and self.charm.backend.auth_user
and self.charm.backend.auth_user in auth_file
):
self.charm.render_auth_file(auth_file)

self.charm.render_utility_files()
self.charm.reload_pgbouncer()
if self.charm.backend.postgres:
Expand Down
16 changes: 15 additions & 1 deletion tests/integration/relations/pgbouncer_provider/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import asyncio
import json
import logging
from typing import Optional
from typing import Dict, Optional
from uuid import uuid4

import psycopg2
import yaml
from juju.unit import Unit
from pytest_operator.plugin import OpsTest
from tenacity import Retrying, stop_after_attempt, wait_fixed

Expand Down Expand Up @@ -161,6 +162,19 @@ async def check_new_relation(ops_test: OpsTest, unit_name, relation_id, relation
assert smoke_val in json.loads(run_update_query["results"])[0]


async def fetch_action_get_credentials(unit: Unit) -> Dict:
"""Helper to run an action to fetch connection info.
Args:
unit: The juju unit on which to run the get_credentials action for credentials
Returns:
A dictionary with the username, password and access info for the service
"""
action = await unit.run_action(action_name="get-credentials")
result = await action.wait()
return result.results


def check_exposed_connection(credentials, tls):
table_name = "expose_test"
smoke_val = str(uuid4())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@
# See LICENSE file for licensing details.
import asyncio
import logging
from typing import Dict

import pytest
from juju.unit import Unit
from pytest_operator.plugin import OpsTest

from constants import BACKEND_RELATION_NAME, PGB_CONF_DIR
Expand All @@ -19,7 +17,7 @@
get_unit_cores,
)
from ...juju_ import juju_major_version
from .helpers import check_exposed_connection
from .helpers import check_exposed_connection, fetch_action_get_credentials

logger = logging.getLogger(__name__)

Expand All @@ -41,19 +39,6 @@
tls_config = {"ca-common-name": "Test CA"}


async def fetch_action_get_credentials(unit: Unit) -> Dict:
"""Helper to run an action to fetch connection info.
Args:
unit: The juju unit on which to run the get_credentials action for credentials
Returns:
A dictionary with the username, password and access info for the service
"""
action = await unit.run_action(action_name="get-credentials")
result = await action.wait()
return result.results


@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_deploy_and_relate(ops_test: OpsTest, pgb_charm_jammy):
Expand Down
8 changes: 7 additions & 1 deletion tests/integration/test_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ async def test_fail_and_rollback(ops_test, continuous_writes, pgb_charm_jammy) -
application = ops_test.model.applications[PGB]

logger.info("Refresh the charm")
await application.refresh(path=pgb_charm_jammy)
await application.refresh(path=fault_charm)

logger.info("Wait for upgrade to fail")
async with ops_test.fast_forward("60s"):
Expand All @@ -171,6 +171,12 @@ async def test_fail_and_rollback(ops_test, continuous_writes, pgb_charm_jammy) -
logger.info("Re-refresh the charm")
await application.refresh(path=pgb_charm_jammy)

logger.info("Wait for upgrade to start")
await ops_test.model.block_until(
lambda: "waiting" in {unit.workload_status for unit in application.units},
timeout=TIMEOUT,
)

logger.info("Wait for application to recover")
async with ops_test.fast_forward("60s"):
await ops_test.model.wait_for_idle(apps=[PGB], status="active", timeout=TIMEOUT)
Expand Down
107 changes: 107 additions & 0 deletions tests/integration/test_upgrade_data_integrator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
import asyncio
import logging

import pytest
from pytest_operator.plugin import OpsTest

from constants import BACKEND_RELATION_NAME

from .helpers.helpers import (
PG,
PGB,
)
from .helpers.postgresql_helpers import get_leader_unit
from .relations.pgbouncer_provider.helpers import (
check_exposed_connection,
fetch_action_get_credentials,
)

logger = logging.getLogger(__name__)

TIMEOUT = 600
DATA_INTEGRATOR_APP_NAME = "data-integrator"


@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_deploy_stable(ops_test: OpsTest, pgb_charm_jammy) -> None:
"""Simple test to ensure that the PostgreSQL and application charms get deployed."""
await asyncio.gather(
ops_test.model.deploy(
PG,
num_units=3,
channel="14/edge",
config={"profile": "testing"},
),
ops_test.model.deploy(
PGB,
channel="1/stable",
num_units=None,
),
ops_test.model.deploy(
DATA_INTEGRATOR_APP_NAME,
num_units=2,
channel="latest/edge",
config={"database-name": "test-database"},
),
)
logger.info("Wait for applications to become active")

await ops_test.model.add_relation(f"{PGB}:{BACKEND_RELATION_NAME}", f"{PG}:database")
await ops_test.model.add_relation(DATA_INTEGRATOR_APP_NAME, PGB)
async with ops_test.fast_forward():
await ops_test.model.wait_for_idle(
apps=[PG, PGB, DATA_INTEGRATOR_APP_NAME], status="active", timeout=1500
)
assert len(ops_test.model.applications[PG].units) == 3
assert len(ops_test.model.applications[PGB].units) == 2


@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_pre_upgrade_check(ops_test: OpsTest) -> None:
"""Test that the pre-upgrade-check action runs successfully."""
logger.info("Get leader unit")
leader_unit = await get_leader_unit(ops_test, PGB)
assert leader_unit is not None, "No leader unit found"

logger.info("Run pre-upgrade-check action")
action = await leader_unit.run_action("pre-upgrade-check")
await action.wait()


@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_upgrade_from_stable(ops_test: OpsTest, pgb_charm_jammy):
"""Test updating from stable channel."""
credentials = await fetch_action_get_credentials(
ops_test.model.applications[DATA_INTEGRATOR_APP_NAME].units[0]
)
check_exposed_connection(credentials, False)

application = ops_test.model.applications[PGB]
actions = await application.get_actions()

logger.info("Refresh the charm")
await application.refresh(path=pgb_charm_jammy)

logger.info("Wait for upgrade to start")
await ops_test.model.block_until(
lambda: ("waiting" if "pre-upgrade-check" in actions else "maintenance")
in {unit.workload_status for unit in application.units},
timeout=TIMEOUT,
)

logger.info("Wait for upgrade to complete")
async with ops_test.fast_forward("60s"):
await ops_test.model.wait_for_idle(
apps=[PGB], status="active", idle_period=30, timeout=TIMEOUT
)

credentials = await fetch_action_get_credentials(
ops_test.model.applications[DATA_INTEGRATOR_APP_NAME].units[0]
)
check_exposed_connection(credentials, False)
1 change: 1 addition & 0 deletions tests/unit/relations/test_peers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def setUp(self):
self.charm = self.harness.charm
self.app = self.charm.app.name
self.unit = self.charm.unit.name
self.harness.add_relation("upgrade", self.charm.app.name)

self.rel_id = self.harness.add_relation(PEER_RELATION_NAME, self.charm.app.name)

Expand Down
1 change: 1 addition & 0 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def setUp(self):
self.charm = self.harness.charm
self.unit = self.harness.charm.unit

self.harness.add_relation("upgrade", self.charm.app.name)
self.rel_id = self.harness.add_relation(PEER_RELATION_NAME, self.charm.app.name)

@pytest.fixture
Expand Down

0 comments on commit c379b0d

Please sign in to comment.