[DPE-3684] Reinitialise raft #611
base: main
Changes from 86 commits
@@ -429,6 +429,12 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
                )
                event.defer()
                return
        except RetryError:
            logger.warning(
                "Early exit on_peer_relation_departed: Cannot get %s member IP"
                % event.departing_unit.name
            )
            return

        # Allow leader to update the cluster members.
        if not self.unit.is_leader():
@@ -508,20 +514,163 @@ def _on_pgdata_storage_detaching(self, _) -> None:
        if self.primary_endpoint:
            self._update_relation_endpoints()

    def _on_peer_relation_changed(self, event: HookEvent):
        """Reconfigure cluster members when something changes."""
    def _stuck_raft_cluster_check(self) -> None:
        """Check for stuck raft cluster and reinitialise if safe."""
        raft_stuck = False
        all_units_stuck = True
        candidate = self.app_peer_data.get("raft_selected_candidate")
        for key, data in self._peers.data.items():
            if key == self.app:
                continue
            if "raft_stuck" in data:
                raft_stuck = True
            else:
                all_units_stuck = False
            if not candidate and "raft_candidate" in data:
                candidate = key

        if not raft_stuck:
            return

        if not all_units_stuck:
            logger.warning("Stuck raft not yet detected on all units")
            return

        if not candidate:
            logger.warning("Stuck raft has no candidate")
            return
        if "raft_selected_candidate" not in self.app_peer_data:
            logger.info("%s selected for new raft leader" % candidate.name)
            self.app_peer_data["raft_selected_candidate"] = candidate.name

    def _stuck_raft_cluster_rejoin(self) -> None:
        """Reconnect cluster to new raft."""
        primary = None
        for key, data in self._peers.data.items():
            if key == self.app:
                continue
            if "raft_primary" in data:
                primary = key
                break
        if primary and "raft_reset_primary" not in self.app_peer_data:
            logger.info("Updating the primary endpoint")
            self.app_peer_data.pop("members_ips", None)
            self._add_to_members_ips(self._get_unit_ip(primary))
            self.app_peer_data["raft_reset_primary"] = "True"
            self._update_relation_endpoints()
        if (
            "raft_rejoin" not in self.app_peer_data
            and "raft_followers_stopped" in self.app_peer_data
            and "raft_reset_primary" in self.app_peer_data
        ):
            logger.info("Notify units they can rejoin")
            self.app_peer_data["raft_rejoin"] = "True"

    def _stuck_raft_cluster_stopped_check(self) -> None:
        """Check that the cluster is stopped."""
        if "raft_followers_stopped" in self.app_peer_data:
            return

        for key, data in self._peers.data.items():
            if key == self.app:
                continue
            if "raft_stopped" not in data:
                return

        logger.info("Cluster is shut down")
        self.app_peer_data["raft_followers_stopped"] = "True"

    def _stuck_raft_cluster_cleanup(self) -> None:
        for key, data in self._peers.data.items():
            if key == self.app:
                continue
            for flag in data.keys():
                if flag.startswith("raft_"):
                    return

        logger.info("Cleaning up raft app data")
        self.app_peer_data.pop("raft_rejoin", None)
        self.app_peer_data.pop("raft_reset_primary", None)
        self.app_peer_data.pop("raft_selected_candidate", None)
        self.app_peer_data.pop("raft_followers_stopped", None)

    def _raft_reinitialisation(self) -> None:
Review comment: If there's only one unit (sync standby and leader), this should execute in one go.
"""Handle raft cluster loss of quorum.""" | ||
# Skip to cleanup if rejoining | ||
if "raft_rejoin" not in self.app_peer_data: | ||
if self.unit.is_leader(): | ||
self._stuck_raft_cluster_check() | ||
|
||
if ( | ||
candidate := self.app_peer_data.get("raft_selected_candidate") | ||
) and "raft_stopped" not in self.unit_peer_data: | ||
self.unit_peer_data.pop("raft_stuck", None) | ||
self.unit_peer_data.pop("raft_candidate", None) | ||
self._patroni.remove_raft_data() | ||
logger.info("Stopping %s" % self.unit.name) | ||
self.unit_peer_data["raft_stopped"] = "True" | ||
|
||
if self.unit.is_leader(): | ||
self._stuck_raft_cluster_stopped_check() | ||
|
||
if ( | ||
candidate == self.unit.name | ||
and "raft_primary" not in self.unit_peer_data | ||
and "raft_followers_stopped" in self.app_peer_data | ||
): | ||
logger.info("Reinitialising %s as primary" % self.unit.name) | ||
self._patroni.reinitialise_raft_data() | ||
self.unit_peer_data["raft_primary"] = "True" | ||
|
||
if self.unit.is_leader(): | ||
self._stuck_raft_cluster_rejoin() | ||
|
||
if "raft_rejoin" in self.app_peer_data: | ||
logger.info("Cleaning up raft unit data") | ||
self.unit_peer_data.pop("raft_primary", None) | ||
self.unit_peer_data.pop("raft_stopped", None) | ||
self._patroni.start_patroni() | ||
|
||
if self.unit.is_leader(): | ||
self._stuck_raft_cluster_cleanup() | ||
|
||
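
Not part of the diff: to make the review comment above concrete, here is a rough toy model of the flag handshake, with plain dicts standing in for app and unit peer data and an invented unit name. With a single unit that is both leader and sync standby, every gate is satisfied immediately, so the whole sequence can complete in one hook run, as the reviewer notes.

# Toy model only; nothing below is charm code.
app_data: dict = {}
unit_data = {"postgresql/0": {"raft_stuck": "True", "raft_candidate": "True"}}

# Leader: every unit reports raft_stuck, so select a candidate.
if all("raft_stuck" in flags for flags in unit_data.values()):
    app_data.setdefault("raft_selected_candidate", next(iter(unit_data)))

# Each unit: a candidate exists, so drop local raft data and report raft_stopped.
for flags in unit_data.values():
    if "raft_selected_candidate" in app_data and "raft_stopped" not in flags:
        flags.pop("raft_stuck", None)
        flags.pop("raft_candidate", None)
        flags["raft_stopped"] = "True"

# Leader: all units stopped, so the candidate may reinitialise raft as the new primary.
if all("raft_stopped" in flags for flags in unit_data.values()):
    app_data["raft_followers_stopped"] = "True"
candidate = app_data.get("raft_selected_candidate")
if candidate and "raft_followers_stopped" in app_data:
    unit_data[candidate]["raft_primary"] = "True"

# Leader: a new primary exists, so publish it and let every unit rejoin and clean up.
if any("raft_primary" in flags for flags in unit_data.values()):
    app_data["raft_reset_primary"] = "True"
    app_data["raft_rejoin"] = "True"

print(app_data)   # all gating flags set after a single pass
print(unit_data)  # the lone unit ends up as the new raft primary
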
    def has_raft_keys(self):
        """Checks for the presence of raft recovery keys in peer data."""
        for key in self.app_peer_data.keys():
            if key.startswith("raft_"):
                return True

        for key in self.unit_peer_data.keys():
            if key.startswith("raft_"):
                return True
        return False

    def _peer_relation_changed_checks(self, event: HookEvent) -> bool:
        """Split off to reduce complexity."""
        # Prevents the cluster to be reconfigured before it's bootstrapped in the leader.
        if "cluster_initialised" not in self._peers.data[self.app]:
            logger.debug("Deferring on_peer_relation_changed: cluster not initialized")
            event.defer()
            return
            return False

        # Check whether raft is stuck.
        if self.has_raft_keys():
            self._raft_reinitialisation()
            logger.debug("Early exit on_peer_relation_changed: stuck raft recovery")
            return False
Review comment on lines +662 to +666: This will hijack execution until recovery completes. We should think of ways to detect manual recovery.

        # If the unit is the leader, it can reconfigure the cluster.
        if self.unit.is_leader() and not self._reconfigure_cluster(event):
            event.defer()
            return
            return False

        if self._update_member_ip():
            return False
        return True

    def _on_peer_relation_changed(self, event: HookEvent):
        """Reconfigure cluster members when something changes."""
        if not self._peer_relation_changed_checks(event):
            return

        # Don't update this member before it's part of the members list.
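
Aside, not part of the diff: on the review comment above about detecting manual recovery, one option would be an operator-settable escape hatch in app peer data that _peer_relation_changed_checks consults before entering recovery. The raft_manual_recovery key and the helper name below are invented for illustration; nothing in this PR sets or reads them.

    def _manual_recovery_done(self) -> bool:
        """Hypothetical check: an operator signals that raft was repaired by hand."""
        # "raft_manual_recovery" is an invented flag; it is not part of this PR.
        return self.app_peer_data.get("raft_manual_recovery") == "True"

If this returned True, _peer_relation_changed_checks could skip the _raft_reinitialisation() branch and simply clear any leftover raft_* keys instead of hijacking the hook.
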
@@ -563,7 +712,8 @@ def _on_peer_relation_changed(self, event: HookEvent):
        # Restart the workload if it's stuck on the starting state after a timeline divergence
        # due to a backup that was restored.
        if (
            not self.is_primary
            not self.has_raft_keys()
            and not self.is_primary
            and not self.is_standby_leader
            and (
                self._patroni.member_replication_lag == "unknown"
@@ -712,14 +862,16 @@ def add_cluster_member(self, member: str) -> None:
    def _get_unit_ip(self, unit: Unit) -> Optional[str]:
        """Get the IP address of a specific unit."""
        # Check if host is current host.
        ip = None
        if unit == self.unit:
            return str(self.model.get_binding(PEER).network.bind_address)
            ip = self.model.get_binding(PEER).network.bind_address
        # Check if host is a peer.
        elif unit in self._peers.data:
            return str(self._peers.data[unit].get("private-address"))
            ip = self._peers.data[unit].get("private-address")
        # Return None if the unit is not a peer neither the current unit.
        else:
            return None
        if ip:
            return str(ip)
        return None

    @property
    def _hosts(self) -> set:
@@ -911,6 +1063,10 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
            if self.get_secret(APP_SCOPE, key) is None:
                self.set_secret(APP_SCOPE, key, new_password())

        if self.has_raft_keys():
            self._raft_reinitialisation()
            return

        # Update the list of the current PostgreSQL hosts when a new leader is elected.
        # Add this unit to the list of cluster members
        # (the cluster should start with only this member).
@@ -1417,7 +1573,8 @@ def _handle_workload_failures(self) -> bool:
            return False

        if (
            not is_primary
            not self.has_raft_keys()
            and not is_primary
            and not is_standby_leader
            and not self._patroni.member_started
            and "postgresql_restarted" in self._peers.data[self.unit]
@@ -1618,7 +1775,7 @@ def _can_connect_to_postgresql(self) -> bool:
            return False
        return True

    def update_config(self, is_creating_backup: bool = False) -> bool:
    def update_config(self, is_creating_backup: bool = False, no_peers: bool = False) -> bool:
        """Updates Patroni config file based on the existence of the TLS files."""
        enable_tls = self.is_tls_enabled
        limit_memory = None
@@ -1644,7 +1801,11 @@ def update_config(self, is_creating_backup: bool = False) -> bool:
                self.app_peer_data.get("require-change-bucket-after-restore", None)
            ),
            parameters=pg_parameters,
            no_peers=no_peers,
        )
        if no_peers:
            return True

        if not self._is_workload_running:
            # If Patroni/PostgreSQL has not started yet and TLS relations was initialised,
            # then mark TLS as enabled. This commonly happens when the charm is deployed
Review comment: We can't proceed with automatic recovery if there's no sync standby, since the first unit on the new raft cluster will be leader and there may be data loss if promoting an async replica. We should consider setting a status and ways for manual recovery, if the user wants to promote a given replica anyway.
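
A minimal sketch of what that could look like, assuming a hypothetical promote-replica action on the charm class. The action name, its unit-name and force parameters, and the status message are invented here; only has_raft_keys(), app_peer_data and the raft_selected_candidate key come from the diff above.

from ops.charm import ActionEvent
from ops.model import BlockedStatus


def _on_promote_replica_action(self, event: ActionEvent) -> None:
    """Hypothetical action handler (would live on the charm class): promote a chosen replica, accepting possible data loss."""
    if not self.has_raft_keys():
        event.fail("No raft recovery is in progress")
        return
    candidate = event.params.get("unit-name")
    if not candidate:
        event.fail("The unit-name parameter is required")
        return
    if not event.params.get("force", False):
        # Surface the risk instead of silently promoting an async replica.
        self.unit.status = BlockedStatus(
            "Raft recovery needs a sync standby; rerun with force=true to promote anyway"
        )
        event.fail("Refusing to promote an async replica without force=true")
        return
    # Feed the existing flag machinery: the selected candidate becomes the new raft primary.
    self.app_peer_data["raft_selected_candidate"] = candidate
    event.set_results({"selected-candidate": candidate})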