[DPE-3684] Implement DA139 #663

Closed
wants to merge 46 commits
Changes from all commits
Commits (46)
e0a09ae
Three units scaling tests
dragomirp Oct 27, 2024
42899c1
Pass the machine name when stopping
dragomirp Oct 27, 2024
5d972ac
Parameterised test
dragomirp Oct 27, 2024
7123659
Parameterised test
dragomirp Oct 27, 2024
e06c077
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Oct 27, 2024
c0fece5
Log roles removed
dragomirp Oct 27, 2024
634a9bb
Track candidate by cluster status
dragomirp Oct 27, 2024
5088114
Unit tests
dragomirp Oct 27, 2024
cc16d5f
Set maintenance status during reinit
dragomirp Oct 28, 2024
ae5b560
Degraded status message
dragomirp Oct 29, 2024
da33c75
Linting
dragomirp Oct 29, 2024
782979b
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Nov 19, 2024
ebf1361
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Nov 22, 2024
07fd969
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Nov 25, 2024
5e18530
Try to reinit on update status
dragomirp Nov 26, 2024
1e17ced
Wrong parameter
dragomirp Nov 26, 2024
e224caf
Add experimental action
dragomirp Nov 26, 2024
2308e8c
Run reinit as part of the action for leader
dragomirp Nov 26, 2024
a20cf72
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Dec 24, 2024
55a7a7e
Merge branch 'dpe-3684-three-units' into dpe-3684-reinit-action
dragomirp Dec 24, 2024
dd88198
Promote via existing action
dragomirp Dec 24, 2024
74b4d48
Switch action
dragomirp Dec 24, 2024
ffd6d58
Bump timeout
dragomirp Dec 24, 2024
e459d05
Tests
dragomirp Dec 26, 2024
5f3609f
Raise flag on read only primary
dragomirp Dec 30, 2024
df65bd0
Wrong call
dragomirp Dec 30, 2024
c062fc0
Fix tests
dragomirp Jan 4, 2025
86d14b9
GH build job workaround
dragomirp Jan 6, 2025
9b3bcfd
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Jan 8, 2025
720acf0
Use a sync unit to run the restore action
dragomirp Jan 8, 2025
66088cb
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Jan 19, 2025
a39bae9
Regular promote
dragomirp Jan 21, 2025
5f6ad5e
Switch test order
dragomirp Jan 21, 2025
b76412e
Merge branch 'main' into dpe-3684-three-units
dragomirp Jan 22, 2025
e1de0c4
Remove replica test
dragomirp Jan 22, 2025
f7cef49
Unit tests
dragomirp Jan 22, 2025
7d4bb84
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Jan 22, 2025
b3f44bc
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Jan 23, 2025
ecb70d5
Check for Patroni self-healing
dragomirp Jan 23, 2025
caf076f
Correct timeout exception
dragomirp Jan 23, 2025
84c6601
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Jan 23, 2025
9c00278
Remove dead code
dragomirp Jan 23, 2025
d084026
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Jan 27, 2025
7d2070a
Merge branch 'dpe-3684-reinitialise-raft' into dpe-3684-three-units
dragomirp Jan 29, 2025
adb8ba9
Missed auto recovery
dragomirp Jan 29, 2025
72ddcac
Timeout check
dragomirp Jan 29, 2025
7 changes: 6 additions & 1 deletion actions.yaml
@@ -33,8 +33,13 @@ list-backups:
pre-upgrade-check:
description: Run necessary pre-upgrade checks and preparations before executing a charm refresh.
promote-to-primary:
description: Promotes the cluster of choice to a primary cluster. Must be ran against the leader unit.
description: Promotes the cluster of choice to a primary cluster. Must be run against the leader unit when promoting a cluster
or against the unit to be promoted within the cluster.
params:
scope:
type: string
default: cluster
description: Whether to promote a unit or a cluster. Must be set to either unit or cluster.
force:
type: boolean
description: Force the promotion of a cluster when there is already a primary cluster.
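For orientation, here is roughly how the new unit-scoped action is driven from python-libjuju, mirroring the integration tests added in this PR; the unit name is illustrative and assumes an async pytest-operator context with an ops_test fixture:

    # Illustrative only: promote a specific surviving unit after raft majority loss.
    run_action = await ops_test.model.units["postgresql/2"].run_action(
        "promote-to-primary", scope="unit", force=True
    )
    await run_action.wait()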
45 changes: 41 additions & 4 deletions src/charm.py
@@ -185,6 +185,7 @@ def __init__(self, *args):
self.framework.observe(self.on.start, self._on_start)
self.framework.observe(self.on.get_password_action, self._on_get_password)
self.framework.observe(self.on.set_password_action, self._on_set_password)
self.framework.observe(self.on.promote_to_primary_action, self._on_promote_to_primary)
self.framework.observe(self.on.update_status, self._on_update_status)
self.cluster_name = self.app.name
self._member_name = self.unit.name.replace("/", "-")
@@ -631,6 +632,7 @@ def _raft_reinitialisation(self) -> None:
and "raft_primary" not in self.unit_peer_data
and "raft_followers_stopped" in self.app_peer_data
):
self.unit.status = MaintenanceStatus("Reinitialising raft")
logger.info(f"Reinitialising {self.unit.name} as primary")
self._patroni.reinitialise_raft_data()
self.unit_peer_data["raft_primary"] = "True"
@@ -644,6 +646,7 @@
self.unit_peer_data.pop("raft_stopped", None)
self.update_config()
self._patroni.start_patroni()
self._set_primary_status_message()

if self.unit.is_leader():
self._stuck_raft_cluster_cleanup()
@@ -1506,6 +1509,32 @@ def _on_set_password(self, event: ActionEvent) -> None:

event.set_results({"password": password})

def _on_promote_to_primary(self, event: ActionEvent) -> None:
if event.params.get("scope") == "cluster":
return self.async_replication.promote_to_primary(event)
elif event.params.get("scope") == "unit":
return self.promote_primary_unit(event)
else:
event.fail("Scope should be either cluster or unit")

def promote_primary_unit(self, event: ActionEvent) -> None:
"""Handles promote to primary for unit scope."""
if event.params.get("force"):
if self.has_raft_keys():
self.unit_peer_data.update({"raft_candidate": "True"})
if self.unit.is_leader():
self._raft_reinitialisation()
return
event.fail("Raft is not stuck")
else:
if self.has_raft_keys():
event.fail("Raft is stuck. Set force to reinitialise with new primary")
return
try:
self._patroni.switchover(self._member_name)
except SwitchoverFailedError:
event.fail("Unit is not sync standby")

def _on_update_status(self, _) -> None:
"""Update the unit status message and users list in the database."""
if not self._can_run_on_update_status():
@@ -1675,10 +1704,18 @@ def _set_primary_status_message(self) -> None:
self.app_peer_data["s3-initialization-block-message"]
)
return
if self._patroni.get_primary(unit_name_pattern=True) == self.unit.name:
self.unit.status = ActiveStatus("Primary")
elif self.is_standby_leader:
self.unit.status = ActiveStatus("Standby")
if (
self._patroni.get_primary(unit_name_pattern=True) == self.unit.name
or self.is_standby_leader
):
danger_state = ""
if not self._patroni.has_raft_quorum():
danger_state = " (read-only)"
elif len(self._patroni.get_running_cluster_members()) < self.app.planned_units():
danger_state = " (degraded)"
self.unit.status = ActiveStatus(
f"{'Standby' if self.is_standby_leader else 'Primary'}{danger_state}"
)
elif self._patroni.member_started:
self.unit.status = ActiveStatus()
except (RetryError, ConnectionError) as e:
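As a summary of the _set_primary_status_message change above, the suffix selection boils down to the following self-contained sketch; the standalone function is illustrative and not part of the charm:

    # Sketch of the status-suffix rule added above: losing raft quorum takes precedence
    # over a merely degraded member count.
    def danger_suffix(has_quorum: bool, running_members: int, planned_units: int) -> str:
        if not has_quorum:
            return " (read-only)"  # no raft quorum: the leader cannot safely accept writes
        if running_members < planned_units:
            return " (degraded)"  # quorum holds, but some planned units are not running
        return ""

    assert danger_suffix(False, 1, 3) == " (read-only)"
    assert danger_suffix(True, 2, 3) == " (degraded)"
    assert danger_suffix(True, 3, 3) == ""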
47 changes: 35 additions & 12 deletions src/cluster.py
@@ -18,6 +18,7 @@
import requests
from charms.operator_libs_linux.v2 import snap
from jinja2 import Template
from ops import BlockedStatus
from pysyncobj.utility import TcpUtility, UtilityException
from tenacity import (
AttemptManager,
@@ -746,15 +747,18 @@ def stop_patroni(self) -> bool:
logger.exception(error_message, exc_info=e)
return False

def switchover(self) -> None:
def switchover(self, candidate: str | None = None) -> None:
Review comment (PR author): Pass a candidate when promoting a specific unit.

"""Trigger a switchover."""
# Try to trigger the switchover.
for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)):
with attempt:
current_primary = self.get_primary()
body = {"leader": current_primary}
if candidate:
body["candidate"] = candidate
r = requests.post(
f"{self._patroni_url}/switchover",
json={"leader": current_primary},
json=body,
verify=self.verify,
auth=self._patroni_auth,
timeout=PATRONI_TIMEOUT,
@@ -774,6 +778,19 @@ def primary_changed(self, old_primary: str) -> bool:
primary = self.get_primary()
return primary != old_primary

def has_raft_quorum(self) -> bool:
"""Check if raft cluster has quorum."""
# Get the status of the raft cluster.
syncobj_util = TcpUtility(password=self.raft_password, timeout=3)

raft_host = "127.0.0.1:2222"
try:
raft_status = syncobj_util.executeCommand(raft_host, ["status"])
except UtilityException:
logger.warning("Has raft quorum: Cannot connect to raft cluster")
return False
return raft_status["has_quorum"]

def remove_raft_data(self) -> None:
"""Stops Patroni and removes the raft journals."""
logger.info("Stopping patroni")
@@ -827,6 +844,21 @@ def reinitialise_raft_data(self) -> None:
raise RaftPostgresqlNotUpError()
logger.info("Raft should be unstuck")

def get_running_cluster_members(self) -> list[str]:
"""List running patroni members."""
try:
members = requests.get(
f"{self._patroni_url}/{PATRONI_CLUSTER_STATUS_ENDPOINT}",
verify=self.verify,
timeout=API_REQUEST_TIMEOUT,
auth=self._patroni_auth,
).json()["members"]
return [
member["name"] for member in members if member["state"] in ("streaming", "running")
]
except Exception:
return []

def remove_raft_member(self, member_ip: str) -> None:
"""Remove a member from the raft cluster.

@@ -860,18 +892,9 @@ def remove_raft_member(self, member_ip: str) -> None:
if not raft_status["has_quorum"] and (
not raft_status["leader"] or raft_status["leader"].host == member_ip
):
self.charm.unit.status = BlockedStatus("Raft majority loss, run: promote-to-primary")
logger.warning("Remove raft member: Stuck raft cluster detected")
data_flags = {"raft_stuck": "True"}
try:
health_status = self.get_patroni_health()
except Exception:
logger.warning("Remove raft member: Unable to get health status")
health_status = {}
if health_status.get("role") in ("leader", "master") or health_status.get(
"sync_standby"
):
logger.info(f"{self.charm.unit.name} is raft candidate")
data_flags["raft_candidate"] = "True"
Review comment on lines -865 to -874 (PR author): Wait for the action to start reinit.

self.charm.unit_peer_data.update(data_flags)

# Leader doesn't always trigger when changing its own peer data.
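For readers unfamiliar with pysyncobj, the new has_raft_quorum helper above reduces to the following standalone sketch; the hard-coded 127.0.0.1:2222 endpoint mirrors the diff, while the function signature and password argument are illustrative:

    # Standalone sketch of the quorum probe performed by has_raft_quorum() above.
    from pysyncobj.utility import TcpUtility, UtilityException

    def raft_has_quorum(raft_password: str, raft_host: str = "127.0.0.1:2222") -> bool:
        syncobj_util = TcpUtility(password=raft_password, timeout=3)
        try:
            raft_status = syncobj_util.executeCommand(raft_host, ["status"])
        except UtilityException:
            # An unreachable raft endpoint is treated as "no quorum" rather than an error.
            return False
        return bool(raft_status["has_quorum"])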
5 changes: 1 addition & 4 deletions src/relations/async_replication.py
@@ -106,9 +106,6 @@ def __init__(self, charm):
self.framework.observe(
self.charm.on.create_replication_action, self._on_create_replication
)
self.framework.observe(
self.charm.on.promote_to_primary_action, self._on_promote_to_primary
)
Review comment on lines -109 to -111 (PR author): Moved to the main charm code, since it's no longer used only for async promotion.

self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed)

@@ -583,7 +580,7 @@ def _on_create_replication(self, event: ActionEvent) -> None:
# Set the status.
self.charm.unit.status = MaintenanceStatus("Creating replication...")

def _on_promote_to_primary(self, event: ActionEvent) -> None:
def promote_to_primary(self, event: ActionEvent) -> None:
"""Promote this cluster to the primary cluster."""
if (
self.charm.app.status.message != READ_ONLY_MODE_BLOCKING_MESSAGE
38 changes: 38 additions & 0 deletions tests/integration/ha_tests/test_scaling.py
@@ -69,6 +69,24 @@ async def test_removing_stereo_primary(ops_test: OpsTest, continuous_writes) ->
primary = await get_primary(ops_test, app)
await ops_test.model.destroy_unit(primary, force=True, destroy_storage=False, max_wait=1500)

left_unit = ops_test.model.units[original_roles["sync_standbys"][0]]
for left_unit in ops_test.model.applications[DATABASE_APP_NAME].units:
if left_unit.name not in original_roles["primaries"]:
break

await ops_test.model.block_until(
lambda: left_unit.workload_status == "blocked"
and left_unit.workload_status_message == "Raft majority loss, run: promote-to-primary",
timeout=600,
)

run_action = (
await ops_test.model.applications[DATABASE_APP_NAME]
.units[0]
.run_action("promote-to-primary", scope="unit", force=True)
)
await run_action.wait()

await ops_test.model.wait_for_idle(status="active", timeout=600, idle_period=45)

await are_writes_increasing(ops_test, primary)
@@ -154,6 +172,16 @@ async def test_removing_raft_majority(ops_test: OpsTest, continuous_writes) -> N
),
)

left_unit = ops_test.model.units[original_roles["sync_standbys"][1]]
await ops_test.model.block_until(
lambda: left_unit.workload_status == "blocked"
and left_unit.workload_status_message == "Raft majority loss, run: promote-to-primary",
timeout=600,
)

run_action = await left_unit.run_action("promote-to-primary", scope="unit", force=True)
await run_action.wait()

await ops_test.model.wait_for_idle(status="active", timeout=900, idle_period=45)

await are_writes_increasing(
@@ -202,6 +230,16 @@ async def test_removing_raft_majority_async(ops_test: OpsTest, continuous_writes
),
)

left_unit = ops_test.model.units[original_roles["sync_standbys"][0]]
await ops_test.model.block_until(
lambda: left_unit.workload_status == "blocked"
and left_unit.workload_status_message == "Raft majority loss, run: promote-to-primary",
timeout=600,
)

run_action = await left_unit.run_action("promote-to-primary", scope="unit", force=True)
await run_action.wait()

await ops_test.model.wait_for_idle(status="active", timeout=900, idle_period=45)

await are_writes_increasing(
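The three hunks above repeat the same recovery sequence: wait for a surviving unit to report the raft-majority-loss block, then run the unit-scoped action against it. A hypothetical helper (not part of the PR) makes that flow explicit:

    # Hypothetical helper summarising the recovery steps used by the tests above.
    async def recover_stuck_raft(ops_test, unit, timeout: int = 600) -> None:
        await ops_test.model.block_until(
            lambda: unit.workload_status == "blocked"
            and unit.workload_status_message == "Raft majority loss, run: promote-to-primary",
            timeout=timeout,
        )
        run_action = await unit.run_action("promote-to-primary", scope="unit", force=True)
        await run_action.wait()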
134 changes: 134 additions & 0 deletions tests/integration/ha_tests/test_scaling_three_units.py
@@ -0,0 +1,134 @@
#!/usr/bin/env python3
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
import logging
from asyncio import exceptions, gather, sleep

import pytest
from pytest_operator.plugin import OpsTest

from .. import markers
from ..helpers import (
CHARM_BASE,
DATABASE_APP_NAME,
get_machine_from_unit,
stop_machine,
)
from .conftest import APPLICATION_NAME
from .helpers import (
app_name,
are_writes_increasing,
check_writes,
get_cluster_roles,
start_continuous_writes,
)

logger = logging.getLogger(__name__)

charm = None


@pytest.mark.group(1)
@markers.juju3
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
"""Build and deploy two PostgreSQL clusters."""
# This is a potentially destructive test, so it shouldn't be run against existing clusters
charm = await ops_test.build_charm(".")
async with ops_test.fast_forward():
# Deploy the first cluster with reusable storage
await gather(
ops_test.model.deploy(
charm,
application_name=DATABASE_APP_NAME,
num_units=3,
base=CHARM_BASE,
config={"profile": "testing"},
),
ops_test.model.deploy(
APPLICATION_NAME,
application_name=APPLICATION_NAME,
base=CHARM_BASE,
channel="edge",
),
)

await ops_test.model.wait_for_idle(status="active", timeout=1500)


@pytest.mark.group(1)
@markers.juju3
@pytest.mark.parametrize(
"roles",
[
["primaries"],
["sync_standbys"],
["replicas"],
["primaries", "replicas"],
["sync_standbys", "replicas"],
],
)
@pytest.mark.abort_on_fail
async def test_removing_unit(ops_test: OpsTest, roles: list[str], continuous_writes) -> None:
logger.info(f"removing {', '.join(roles)}")
# Start an application that continuously writes data to the database.
app = await app_name(ops_test)
original_roles = await get_cluster_roles(
ops_test, ops_test.model.applications[DATABASE_APP_NAME].units[0].name
)
await start_continuous_writes(ops_test, app)
units = [original_roles[role][0] for role in roles]
for unit in units:
logger.info(f"Stopping unit {unit}")
await stop_machine(ops_test, await get_machine_from_unit(ops_test, unit))
await sleep(15)
Review comment (PR author): Sleep for the Juju leadership to drift.

for unit in units:
logger.info(f"Deleting unit {unit}")
await ops_test.model.destroy_unit(unit, force=True, destroy_storage=False, max_wait=1500)

if len(roles) > 1:
for left_unit in ops_test.model.applications[DATABASE_APP_NAME].units:
if left_unit.name not in units:
break
try:
await ops_test.model.block_until(
lambda: left_unit.workload_status == "blocked"
and left_unit.workload_status_message
== "Raft majority loss, run: promote-to-primary",
timeout=600,
)

run_action = (
await ops_test.model.applications[DATABASE_APP_NAME]
.units[0]
.run_action("promote-to-primary", scope="unit", force=True)
)
await run_action.wait()
except exceptions.TimeoutError:
# Check if Patroni self healed
assert (
left_unit.workload_status == "active"
and left_unit.workload_status_message == "Primary"
)
logger.warning(f"Patroni self-healed without raft reinitialisation for roles {roles}")
Review comment on lines +108 to +113 (PR author): Sometimes when removing the primary and async replica, Patroni manages to survive, so adding an exception for this case. Should I nail it down further?
Reply (Member): I think there is no need for that.


await ops_test.model.wait_for_idle(status="active", timeout=600, idle_period=45)

await are_writes_increasing(ops_test, units)

logger.info("Scaling back up")
await ops_test.model.applications[DATABASE_APP_NAME].add_unit(count=len(roles))
await ops_test.model.wait_for_idle(status="active", timeout=1500)

new_roles = await get_cluster_roles(
ops_test, ops_test.model.applications[DATABASE_APP_NAME].units[0].name
)
assert len(new_roles["primaries"]) == 1
assert len(new_roles["sync_standbys"]) == 1
assert len(new_roles["replicas"]) == 1
if "primaries" in roles:
assert new_roles["primaries"][0] == original_roles["sync_standbys"][0]
else:
assert new_roles["primaries"][0] == original_roles["primaries"][0]

await check_writes(ops_test)