diff --git a/src/charm.py b/src/charm.py
index b9790ebde..0a2ae47ec 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -5,7 +5,6 @@
 import json
 import logging
 import re
-import time
 from typing import Any, Dict, List, Optional, Set
 
 import jinja2
@@ -75,7 +74,11 @@
 )
 
 from config import Config
-from exceptions import AdminUserCreationError, MissingSecretError
+from exceptions import (
+    AdminUserCreationError,
+    EarlyRemovalOfConfigServerError,
+    MissingSecretError,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -85,6 +88,9 @@
 UNIT_SCOPE = Config.Relations.UNIT_SCOPE
 Scopes = Config.Relations.Scopes
 
+ONE_HOUR = 3600
+ONE_MINUTE = 60
+HALF_MINUTE = 30
 USER_CREATING_MAX_ATTEMPTS = 5
 USER_CREATION_COOLDOWN = 30
 REPLICA_SET_INIT_CHECK_TIMEOUT = 10
@@ -117,7 +123,7 @@ def __init__(self, *args):
         self.framework.observe(self.on.get_password_action, self._on_get_password)
         self.framework.observe(self.on.set_password_action, self._on_set_password)
 
-        self.framework.observe(self.on.stop, self._on_stop)
+        self.framework.observe(self.on.mongodb_storage_detaching, self.mongodb_storage_detaching)
 
         self.framework.observe(self.on.secret_remove, self._on_secret_remove)
         self.framework.observe(self.on.secret_changed, self._on_secret_changed)
@@ -153,6 +159,10 @@ def __init__(self, *args):
         )
 
     # BEGIN: properties
+    @property
+    def _is_removing_last_replica(self) -> bool:
+        """Returns True if the last replica (juju unit) is getting removed."""
+        return self.app.planned_units() == 0 and len(self.peers_units) == 0
 
     @property
     def monitoring_jobs(self) -> list[dict[str, Any]]:
@@ -691,10 +701,6 @@ def _relation_changes_handler(self, event: RelationEvent) -> None:
             self._connect_mongodb_exporter()
             self._connect_pbm_agent()
 
-        if isinstance(event, RelationDepartedEvent):
-            if event.departing_unit.name == self.unit.name:
-                self.unit_peer_data.setdefault("unit_departed", "True")
-
         if not self.unit.is_leader():
             return
 
@@ -759,19 +765,72 @@ def _reconcile_mongo_hosts_and_users(self, event: RelationEvent) -> None:
                 logger.info("Deferring reconfigure: error=%r", e)
                 event.defer()
 
-    def _on_stop(self, event) -> None:
-        if "True" == self.unit_peer_data.get("unit_departed", "False"):
-            logger.debug(f"{self.unit.name} blocking on_stop")
-            is_in_replica_set = True
-            timeout = UNIT_REMOVAL_TIMEOUT
-            while is_in_replica_set and timeout > 0:
-                is_in_replica_set = self.is_unit_in_replica_set()
-                time.sleep(1)
-                timeout -= 1
-            if timeout < 0:
-                raise Exception(f"{self.unit.name}.on_stop timeout exceeded")
-            logger.debug(f"{self.unit.name} releasing on_stop")
-            self.unit_peer_data["unit_departed"] = ""
+    def update_termination_grace_period(self, seconds: int) -> None:
+        """Patch the termination grace period for the StatefulSet."""
+        # TODO: implement (one possible approach is sketched after this diff).
+        pass
+
+    def mongodb_storage_detaching(self, event) -> None:
+        """Before storage detaches, allow the removing unit to remove itself from the set.
+
+        If the removing unit is primary, also allow it to step down and elect another unit as
+        primary while it still has access to its storage.
+        """
+        # TODO: extend the grace period up front, e.g.
+        # self.update_termination_grace_period(ONE_HOUR), and track the time remaining
+        # (time_left = ONE_HOUR - (datetime.now() - start_time).seconds), acting before
+        # time_left drops below ONE_MINUTE.
+
+        if self.upgrade_in_progress:
+            # We cannot defer and prevent a user from removing a unit; log a warning instead.
+            logger.warning(
+                "Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
+            )
+
+        # A single replica cannot step down as primary, and we cannot reconfigure the
+        # replica set to have 0 members.
+        if self._is_removing_last_replica:
+            # Removing the config-server from a sharded cluster can be disastrous.
+            if self.is_role(Config.Role.CONFIG_SERVER) and self.config_server.has_shards():
+                current_shards = self.config_server.get_related_shards()
+                early_removal_message = f"Cannot remove config-server, still related to shards {', '.join(current_shards)}"
+                logger.error(early_removal_message)
+                # Question: what happens in K8s if you raise in storage-detaching? We assume
+                # the pod is still removed.
+                raise EarlyRemovalOfConfigServerError(early_removal_message)
+
+            # The shard cannot be drained after its storage is detached.
+            if self.is_role(Config.Role.SHARD) and self.shard.has_config_server():
+                logger.info("Waiting for the shard to drain before detaching storage.")
+                self.status.set_and_share_status(MaintenanceStatus("Draining shard from cluster"))
+                mongos_hosts = self.shard.get_mongos_hosts()
+                # TODO: update this function to attempt to patch the StatefulSet.
+                self.shard.wait_for_draining(mongos_hosts)
+                logger.info("Shard successfully drained.")
+
+            self.update_termination_grace_period(HALF_MINUTE)
+            return
+
+        try:
+            # Retries over a period of ~10 minutes to resolve race conditions; it is not
+            # possible to defer in storage-detaching.
+            logger.debug("Removing %s from replica set", self.unit_host(self.unit))
+            for attempt in Retrying(
+                stop=stop_after_attempt(10),
+                wait=wait_fixed(1),
+                reraise=True,
+            ):
+                with attempt:
+                    # remove_replset_member retries for 60 seconds
+                    with MongoDBConnection(self.mongodb_config) as mongo:
+                        mongo.remove_replset_member(self.unit_host(self.unit))
+
+        except NotReadyError:
+            logger.info(
+                "Failed to remove %s from replica set, another member is syncing", self.unit.name
+            )
+        except PyMongoError as e:
+            logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)
+
+        self.update_termination_grace_period(HALF_MINUTE)
 
     def _on_update_status(self, event: UpdateStatusEvent):
         # user-made mistakes might result in other incorrect statuses. Prioritise informing users of
diff --git a/src/exceptions.py b/src/exceptions.py
index 946d7d73b..565bd14fc 100644
--- a/src/exceptions.py
+++ b/src/exceptions.py
@@ -16,6 +16,10 @@ class ApplicationHostNotFoundError(MongoError):
     """Raised when a queried host is not in the application peers or the current host."""
 
 
+class EarlyRemovalOfConfigServerError(Exception):
+    """Raised when there is an attempt to remove a config-server while related to a shard."""
+
+
 class MongoSecretError(MongoError):
     """Common parent for all Mongo Secret Exceptions."""
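Note: `update_termination_grace_period` is left as a `pass` stub in this diff. As a rough sketch of what the TODOs point at, the patch could be applied with lightkube; the names below are assumptions (on Juju K8s the StatefulSet is conventionally named after the application and the namespace after the model), not something this diff confirms:

```python
# Hypothetical sketch, not the implementation in this PR: patch the
# StatefulSet's termination grace period via lightkube.
from lightkube import Client
from lightkube.resources.apps_v1 import StatefulSet


def update_termination_grace_period(app_name: str, namespace: str, seconds: int) -> None:
    """Patch terminationGracePeriodSeconds on the application's StatefulSet."""
    client = Client()  # picks up in-cluster credentials from the charm's service account
    client.patch(
        StatefulSet,
        name=app_name,        # assumption: the StatefulSet is named after the Juju app
        namespace=namespace,  # assumption: the namespace is the Juju model name
        obj={"spec": {"template": {"spec": {"terminationGracePeriodSeconds": seconds}}}},
    )
```

One design consideration: a strategic-merge patch of the pod template may trigger a rolling restart of the StatefulSet's pods, so whether to patch the template (as sketched here) or use some other mechanism is left open by the TODOs in the diff.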