Skip to content

Commit

Permalink
[DPE-4667] check unit status before upgrade (#425)
Browse files Browse the repository at this point in the history
## Issue
The pre-upgrade check doesn't verify that all units in the deployment are
active (for both replica sets and sharded clusters).

## Solution
Add a check for an individual replica set to verify that all units are Active /
Waiting for upgrade.
Add a check for the config-server to verify that all units in the cluster are
Active / Waiting for upgrade.
  • Loading branch information
MiaAltieri authored Jul 3, 2024
1 parent 0328543 commit ae2afe9
Show file tree
Hide file tree
Showing 12 changed files with 365 additions and 105 deletions.
18 changes: 11 additions & 7 deletions lib/charms/mongodb/v0/config_server_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 12
LIBPATCH = 13


class ClusterProvider(Object):
Expand Down Expand Up @@ -109,8 +109,10 @@ def _on_relation_changed(self, event) -> None:
"""Handles providing mongos with KeyFile and hosts."""
if not self.pass_hook_checks(event):
if not self.is_valid_mongos_integration():
self.charm.unit.status = BlockedStatus(
"Relation to mongos not supported, config role must be config-server"
self.charm.status.set_and_share_status(
BlockedStatus(
"Relation to mongos not supported, config role must be config-server"
)
)
logger.info("Skipping relation joined event: hook checks did not pass")
return
Expand Down Expand Up @@ -259,7 +261,9 @@ def _on_relation_changed(self, event) -> None:
event.relation.id, CONFIG_SERVER_DB_KEY
)
if not key_file_contents or not config_server_db_uri:
self.charm.unit.status = WaitingStatus("Waiting for secrets from config-server")
self.charm.status.set_and_share_status(
WaitingStatus("Waiting for secrets from config-server")
)
return

updated_keyfile = self.update_keyfile(key_file_contents=key_file_contents)
Expand All @@ -271,17 +275,17 @@ def _on_relation_changed(self, event) -> None:

# mongos is not available until it is using new secrets
logger.info("Restarting mongos with new secrets")
self.charm.unit.status = MaintenanceStatus("starting mongos")
self.charm.status.set_and_share_status(MaintenanceStatus("starting mongos"))
self.charm.restart_charm_services()

# restart on high loaded databases can be very slow (e.g. up to 10-20 minutes).
if not self.is_mongos_running():
logger.info("mongos has not started, deferring")
self.charm.unit.status = WaitingStatus("Waiting for mongos to start")
self.charm.status.set_and_share_status(WaitingStatus("Waiting for mongos to start"))
event.defer()
return

self.charm.unit.status = ActiveStatus()
self.charm.status.set_and_share_status(ActiveStatus())

def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
# Only relation_departed events can check if scaling down
Expand Down
12 changes: 6 additions & 6 deletions lib/charms/mongodb/v0/mongodb_tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 14
LIBPATCH = 15

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -174,10 +174,10 @@ def _on_tls_relation_broken(self, event: RelationBrokenEvent) -> None:
self.charm.config_server.update_ca_secret(new_ca=None)

logger.info("Restarting mongod with TLS disabled.")
self.charm.unit.status = MaintenanceStatus("disabling TLS")
self.charm.status.set_and_share_status(MaintenanceStatus("disabling TLS"))
self.charm.delete_tls_certificate_from_workload()
self.charm.restart_charm_services()
self.charm.unit.status = ActiveStatus()
self.charm.status.set_and_share_status(ActiveStatus())

def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
"""Enable TLS when TLS certificate available."""
Expand Down Expand Up @@ -224,16 +224,16 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:

self.charm.delete_tls_certificate_from_workload()
self.charm.push_tls_certificate_to_workload()
self.charm.unit.status = MaintenanceStatus("enabling TLS")
self.charm.status.set_and_share_status(MaintenanceStatus("enabling TLS"))
self.charm.restart_charm_services()

if not self.charm.is_db_service_ready():
self.charm.unit.status = WaitingStatus("Waiting for MongoDB to start")
self.charm.status.set_and_share_status(WaitingStatus("Waiting for MongoDB to start"))
elif self.charm.unit.status == WaitingStatus(
"Waiting for MongoDB to start"
) or self.charm.unit.status == MaintenanceStatus("enabling TLS"):
# clear waiting status if db service is ready
self.charm.unit.status = ActiveStatus()
self.charm.status.set_and_share_status(ActiveStatus())

def waiting_for_certs(self):
"""Returns a boolean indicating whether additional certs are needed."""
Expand Down
124 changes: 124 additions & 0 deletions lib/charms/mongodb/v0/set_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""Code for handling statuses in the app and unit."""
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
import json

from ops.charm import CharmBase
from ops.framework import Object
from ops.model import ActiveStatus, StatusBase, WaitingStatus

from config import Config

# The unique Charmhub library identifier, never change it
LIBID = "9b0b9fac53244229aed5ffc5e62141eb"

# Increment this major API version when introducing breaking changes
LIBAPI = 0

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 1


class MongoDBStatusHandler(Object):
    """Sets the unit status and shares upgrade-readiness information.

    Besides assigning the unit status, this handler publishes (for shards) whether the
    unit's status is compatible with an upgrade, and offers checks that read that
    information back for a whole replica set or for all integrated shards.
    """

    def __init__(
        self,
        charm: CharmBase,
    ) -> None:
        """Constructor for MongoDBStatusHandler.

        Args:
            charm: charm to inherit from.
        """
        super().__init__(charm, None)
        self.charm = charm

    # TODO Future PR: handle update_status

    # BEGIN Helpers

    def set_and_share_status(self, status: StatusBase):
        """Sets the charm status and shares to app status and config-server if applicable.

        Args:
            status: status to assign to this unit.
        """
        # TODO Future Feature/Epic: process other statuses, i.e. only set provided status if its
        # appropriate.
        self.charm.unit.status = status

        self.set_app_status()

        if self.charm.is_role(Config.Role.SHARD):
            self.share_status_to_config_server()

    def set_app_status(self):
        """TODO Future Feature/Epic: parse statuses and set a status for the entire app."""

    def are_all_units_ready_for_upgrade(self) -> bool:
        """Returns True if all charm units status's show that they are ready for upgrade.

        A unit counts as ready when the `goal-state` output reports it as "active", or as
        "waiting" while the cluster reports a mismatched revision. Any other status makes
        the whole check fail.
        """
        # NOTE(review): this reaches into private ops API (`_backend._run`) to invoke the
        # `goal-state` hook tool; presumably because ops exposes no public accessor.
        goal_state = self.charm.model._backend._run(
            "goal-state", return_output=True, use_json=True
        )
        is_different_revision = self.charm.get_cluster_mismatched_revision_status()

        for unit_state in goal_state["units"].values():
            unit_status = unit_state["status"]
            if unit_status == "active":
                continue

            if unit_status != "waiting":
                return False

            # a waiting unit is only acceptable when revisions are mismatched
            if not is_different_revision:
                return False

        return True

    def are_shards_status_ready_for_upgrade(self) -> bool:
        """Returns True if all integrated shards status's show that they are ready for upgrade.

        A shard is ready for upgrade if it is either in the waiting for upgrade status or active
        status. A shard unit that has not published its readiness yet is treated as not ready.
        Always returns False when this charm is not a config-server.
        """
        if not self.charm.is_role(Config.Role.CONFIG_SERVER):
            return False

        for sharding_relation in self.charm.config_server.get_all_sharding_relations():
            for unit in sharding_relation.units:
                unit_data = sharding_relation.data[unit]
                raw_ready = unit_data.get(Config.Status.STATUS_READY_FOR_UPGRADE)

                # the key is absent until the shard unit has shared its status at least once;
                # json.loads(None) would raise TypeError, so treat "missing" as "not ready".
                if not raw_ready or not json.loads(raw_ready):
                    return False

        return True

    def share_status_to_config_server(self):
        """Shares this shards status info to the config server.

        Does nothing when this charm is not a shard or has no config-server relation.
        """
        if not self.charm.is_role(Config.Role.SHARD):
            return

        if not (config_relation := self.charm.shard.get_config_server_relation()):
            return

        # relation data values must be strings, hence the JSON-encoded boolean
        config_relation.data[self.charm.unit][Config.Status.STATUS_READY_FOR_UPGRADE] = json.dumps(
            self.is_unit_status_ready_for_upgrade()
        )

    def is_unit_status_ready_for_upgrade(self) -> bool:
        """Returns True if the status of the current unit reflects that it is ready for upgrade.

        The unit is ready when its status is Active, or when it is Waiting with either no
        message or the "waiting for refresh" message defined in Config.
        """
        # isinstance must be given the status *instance*; testing `type(status)` against the
        # status classes is always False and made every unit report "not ready".
        current_status = self.charm.unit.status
        if isinstance(current_status, ActiveStatus):
            return True

        if not isinstance(current_status, WaitingStatus):
            return False

        status_message = current_status.message
        if (
            status_message
            and status_message != Config.Status.CONFIG_SERVER_WAITING_FOR_REFRESH.message
        ):
            return False

        return True

    # END: Helpers
48 changes: 29 additions & 19 deletions lib/charms/mongodb/v1/mongodb_backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 6
LIBPATCH = 7

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -136,8 +136,10 @@ def on_s3_relation_joined(self, event: RelationJoinedEvent) -> None:
logger.debug(
"Shard does not support s3 relations, please relate s3-integrator to config-server only."
)
self.charm.unit.status = BlockedStatus(
"Relation to s3-integrator is not supported, config role must be config-server"
self.charm.status.set_and_share_status(
BlockedStatus(
"Relation to s3-integrator is not supported, config role must be config-server"
)
)

def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
Expand Down Expand Up @@ -190,7 +192,7 @@ def _on_create_backup_action(self, event) -> None:
# cannot create backup if pbm is not ready. This could be due to: resyncing, incompatible,
# options, incorrect credentials, or already creating a backup
pbm_status = self.get_pbm_status()
self.charm.unit.status = pbm_status
self.charm.status.set_and_share_status(pbm_status)

if isinstance(pbm_status, MaintenanceStatus):
self._fail_action_with_error_log(
Expand All @@ -214,8 +216,8 @@ def _on_create_backup_action(self, event) -> None:

try:
backup_id = self._try_to_backup()
self.charm.unit.status = MaintenanceStatus(
f"backup started/running, backup id:'{backup_id}'"
self.charm.status.set_and_share_status(
MaintenanceStatus(f"backup started/running, backup id:'{backup_id}'")
)
self._success_action_with_info_log(
event, action, {"backup-status": f"backup started. backup id: {backup_id}"}
Expand All @@ -232,7 +234,7 @@ def _on_list_backups_action(self, event) -> None:
# cannot list backups if pbm is resyncing, or has incompatible options or incorrect
# credentials
pbm_status = self.get_pbm_status()
self.charm.unit.status = pbm_status
self.charm.status.set_and_share_status(pbm_status)

if isinstance(pbm_status, WaitingStatus):
self._fail_action_with_error_log(
Expand Down Expand Up @@ -270,8 +272,8 @@ def _on_restore_action(self, event) -> None:
try:
backup_id = event.params.get("backup-id")
self._restore(backup_id, remapping_args=event.params.get("remap-pattern"))
self.charm.unit.status = MaintenanceStatus(
f"restore started/running, backup id:'{backup_id}'"
self.charm.status.set_and_share_status(
MaintenanceStatus(f"restore started/running, backup id:'{backup_id}'")
)
self._success_action_with_info_log(
event, action, {"restore-status": "restore started"}
Expand Down Expand Up @@ -301,7 +303,7 @@ def _restore_hook_checks(self, event) -> bool:
# cannot restore backup if pbm is not ready. This could be due to: resyncing, incompatible,
# options, incorrect credentials, creating a backup, or already performing a restore.
pbm_status = self.get_pbm_status()
self.charm.unit.status = pbm_status
self.charm.status.set_and_share_status(pbm_status)
if isinstance(pbm_status, MaintenanceStatus):
self._fail_action_with_error_log(
event, action, "Please wait for current backup/restore to finish."
Expand Down Expand Up @@ -372,33 +374,39 @@ def _configure_pbm_options(self, event) -> None:
self._set_config_options()
self._resync_config_options()
except SetPBMConfigError:
self.charm.unit.status = BlockedStatus("couldn't configure s3 backup options.")
self.charm.status.set_and_share_status(
BlockedStatus("couldn't configure s3 backup options.")
)
return
except snap.SnapError as e:
logger.error("An exception occurred when starting pbm agent, error: %s.", str(e))
self.charm.unit.status = BlockedStatus("couldn't start pbm")
self.charm.status.set_and_share_status(BlockedStatus("couldn't start pbm"))
return
except ResyncError:
self.charm.unit.status = WaitingStatus("waiting to sync s3 configurations.")
self.charm.status.set_and_share_status(
WaitingStatus("waiting to sync s3 configurations.")
)
self._defer_event_with_info_log(
event, action, "Sync-ing configurations needs more time."
)
return
except PBMBusyError:
self.charm.unit.status = WaitingStatus("waiting to sync s3 configurations.")
self.charm.status.set_and_share_status(
WaitingStatus("waiting to sync s3 configurations.")
)
self._defer_event_with_info_log(
event,
action,
"Cannot update configs while PBM is running, must wait for PBM action to finish.",
),
return
except ExecError as e:
self.charm.unit.status = BlockedStatus(self.process_pbm_error(e.stdout))
self.charm.status.set_and_share_status(BlockedStatus(self.process_pbm_error(e.stdout)))
return
except subprocess.CalledProcessError as e:
logger.error("Syncing configurations failed: %s", str(e))

self.charm.unit.status = self.get_pbm_status()
self.charm.status.set_and_share_status(self.get_pbm_status())

def _set_config_options(self):
"""Applying given configurations with pbm."""
Expand Down Expand Up @@ -490,12 +498,14 @@ def _wait_pbm_status(self) -> None:
if "Resync" in current_pbm_op(pbm_status):
# since this process takes several minutes we should let the user know
# immediately.
self.charm.unit.status = WaitingStatus(
"waiting to sync s3 configurations."
self.charm.status.set_and_share_status(
WaitingStatus("waiting to sync s3 configurations.")
)
raise ResyncError
except ExecError as e:
self.charm.unit.status = BlockedStatus(self.process_pbm_error(e.stdout))
self.charm.status.set_and_share_status(
BlockedStatus(self.process_pbm_error(e.stdout))
)

def get_pbm_status(self) -> Optional[StatusBase]:
"""Retrieve pbm status."""
Expand Down
14 changes: 9 additions & 5 deletions lib/charms/mongodb/v1/mongodb_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 5
LIBPATCH = 6

logger = logging.getLogger(__name__)
REL_NAME = "database"
Expand Down Expand Up @@ -98,7 +98,9 @@ def pass_hook_checks(self, event: EventBase) -> bool:

# legacy relations have auth disabled, which new relations require
if self.model.get_relation(LEGACY_REL_NAME):
self.charm.unit.status = BlockedStatus("cannot have both legacy and new relations")
self.charm.status.set_and_share_status(
BlockedStatus("cannot have both legacy and new relations")
)
logger.error("Auth disabled due to existing connections to legacy relations")
return False

Expand Down Expand Up @@ -130,9 +132,9 @@ def _on_relation_event(self, event):
# users have left and auth can be re-enabled.
if self.substrate == "vm" and not self.charm.auth_enabled():
logger.debug("Enabling authentication.")
self.charm.unit.status = MaintenanceStatus("re-enabling authentication")
self.charm.status.set_and_share_status(MaintenanceStatus("re-enabling authentication"))
self.charm.restart_charm_services(auth=True)
self.charm.unit.status = ActiveStatus()
self.charm.status.set_and_share_status(ActiveStatus())

departed_relation_id = None
if type(event) is RelationBrokenEvent:
Expand Down Expand Up @@ -182,7 +184,9 @@ def oversee_users(self, departed_relation_id: Optional[int], event):
# This hook gets called from other contexts within the charm so it is necessary to check
# for legacy relations which have auth disabled, which new relations require
if self.model.get_relation(LEGACY_REL_NAME):
self.charm.unit.status = BlockedStatus("cannot have both legacy and new relations")
self.charm.status.set_and_share_status(
BlockedStatus("cannot have both legacy and new relations")
)
logger.error("Auth disabled due to existing connections to legacy relations")
return

Expand Down
Loading

0 comments on commit ae2afe9

Please sign in to comment.