Skip to content

Commit

Permalink
use mongodb-k8s removal strategy + move to storage detached
Browse files Browse the repository at this point in the history
  • Loading branch information
MiaAltieri committed Oct 14, 2024
1 parent f0a7578 commit 2a9b2bf
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 19 deletions.
89 changes: 70 additions & 19 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
)

from config import Config
from exceptions import AdminUserCreationError, MissingSecretError
from exceptions import AdminUserCreationError, MissingSecretError, EarlyRemovalOfConfigServerError

logger = logging.getLogger(__name__)

Expand All @@ -85,6 +85,9 @@
UNIT_SCOPE = Config.Relations.UNIT_SCOPE
Scopes = Config.Relations.Scopes

# Durations expressed in seconds.
ONE_MINUTE = 60
HALF_MINUTE = ONE_MINUTE // 2
ONE_HOUR = 60 * ONE_MINUTE

# Admin-user creation retry policy and replica-set initialisation check.
# NOTE(review): cooldown/timeout values are presumably in seconds - confirm at the call sites.
USER_CREATING_MAX_ATTEMPTS = 5
USER_CREATION_COOLDOWN = 30
REPLICA_SET_INIT_CHECK_TIMEOUT = 10
Expand Down Expand Up @@ -117,7 +120,7 @@ def __init__(self, *args):

self.framework.observe(self.on.get_password_action, self._on_get_password)
self.framework.observe(self.on.set_password_action, self._on_set_password)
self.framework.observe(self.on.stop, self._on_stop)
self.framework.observe(self.on.mongodb_storage_detaching, self.mongodb_storage_detaching)

self.framework.observe(self.on.secret_remove, self._on_secret_remove)
self.framework.observe(self.on.secret_changed, self._on_secret_changed)
Expand Down Expand Up @@ -691,10 +694,6 @@ def _relation_changes_handler(self, event: RelationEvent) -> None:
self._connect_mongodb_exporter()
self._connect_pbm_agent()

if isinstance(event, RelationDepartedEvent):
if event.departing_unit.name == self.unit.name:
self.unit_peer_data.setdefault("unit_departed", "True")

if not self.unit.is_leader():
return

Expand Down Expand Up @@ -759,19 +758,71 @@ def _reconcile_mongo_hosts_and_users(self, event: RelationEvent) -> None:
logger.info("Deferring reconfigure: error=%r", e)
event.defer()

def _on_stop(self, event) -> None:
    """Block the stop hook until this departing unit has left the replica set.

    Nothing happens unless this unit's peer data has ``unit_departed`` set to ``"True"``.
    When it does, ``is_unit_in_replica_set()`` is polled roughly once per second until the
    unit is no longer a member, after which the ``unit_departed`` flag is cleared.

    Args:
        event: the stop event; not used by this handler.

    Raises:
        Exception: if the unit is still in the replica set after ``UNIT_REMOVAL_TIMEOUT``
            one-second waits. (The previous ``timeout < 0`` check could never be true
            because the loop stopped at ``timeout == 0``, so a timeout was silently
            treated as success.)
    """
    if self.unit_peer_data.get("unit_departed", "False") != "True":
        return

    logger.debug(f"{self.unit.name} blocking on_stop")
    remaining = UNIT_REMOVAL_TIMEOUT
    # Check membership *before* sleeping so we do not wait a needless second once the unit
    # has already been removed from the replica set.
    while self.is_unit_in_replica_set():
        if remaining <= 0:
            raise Exception(f"{self.unit.name}.on_stop timeout exceeded")
        time.sleep(1)
        remaining -= 1

    logger.debug(f"{self.unit.name} releasing on_stop")
    self.unit_peer_data["unit_departed"] = ""
def update_termination_grace_period(self, seconds: int) -> None:
    """Placeholder for changing the pod termination grace period; currently a no-op.

    Args:
        seconds: the grace period that should eventually be applied (unused for now).
    """
    # TODO: patch the StatefulSet; the equivalent manual command would be:
    # kubectl patch statefulset my-statefulset -p '{"spec": {"template": {"spec": {"terminationGracePeriodSeconds": 3600}, "metadata": {"annotations": {"force-update": "'$(date +%s)'"}}}}}'
    return None

def mongodb_storage_detaching(self, event) -> None:
    """Before storage detaches, allow the removing unit to remove itself from the set.

    If the removing unit is primary, also allow it to step down and elect another unit as
    primary while it still has access to its storage.

    Flow:
      * If an upgrade is in progress, only a warning is logged; removal continues.
      * If this is the last replica, the unit is NOT removed from the replica set:
          - a config-server that still has shards raises instead of proceeding;
          - a shard that still has a config-server waits for draining to finish;
        then the termination grace period is updated and the handler returns.
      * Otherwise this unit's host is removed from the replica set, with retries.
        ``NotReadyError`` and ``PyMongoError`` are logged and swallowed, and the
        termination grace period is updated afterwards.

    Args:
        event: the storage-detaching event; not used by this handler.

    Raises:
        EarlyRemovalOfConfigServerError: when the last config-server replica is being
            removed while the config-server still has shards.
    """

    # TODO: extend the termination grace period (e.g. to ONE_HOUR) before doing the work
    # below, and track the time elapsed so it can be extended again when less than
    # ONE_MINUTE is left. Not implemented yet.

    if self.upgrade_in_progress:
        # We cannot defer and prevent a user from removing a unit, log a warning instead.
        logger.warning(
            "Removing replicas during an upgrade is not supported. The charm may be in a broken, unrecoverable state"
        )

    # A single replica cannot step down as primary and we cannot reconfigure the replica set to
    # have 0 members.
    if self._is_removing_last_replica:
        # Removing a config-server from a sharded cluster can be disastrous.
        if self.is_role(Config.Role.CONFIG_SERVER) and self.config_server.has_shards():
            current_shards = self.config_server.get_related_shards()
            early_removal_message = f"Cannot remove config-server, still related to shards {', '.join(current_shards)}"
            logger.error(early_removal_message)
            # NOTE(review): open question - what happens in K8s if we raise in storage
            # detaching? Presumably the pod is still removed; confirm.
            raise EarlyRemovalOfConfigServerError(early_removal_message)

        # A shard cannot be drained after its storage is detached, so drain it now.
        if self.is_role(Config.Role.SHARD) and self.shard.has_config_server():
            logger.info("Wait for shard to drain before detaching storage.")
            self.status.set_and_share_status(MaintenanceStatus("Draining shard from cluster"))
            mongos_hosts = self.shard.get_mongos_hosts()
            # TODO need to update this function to attempt to patch the statefulset
            self.shard.wait_for_draining(mongos_hosts)
            logger.info("Shard successfully drained storage.")

        self.update_termination_grace_period(HALF_MINUTE)
        return

    try:
        # It is not possible to defer in storage detaching, so retry instead to ride out race
        # conditions: up to 10 attempts, each of which may itself retry for 60 seconds (see
        # below), i.e. roughly 10 minutes in total.
        logger.debug("Removing %s from replica set", self.unit_host(self.unit))
        for attempt in Retrying(
            stop=stop_after_attempt(10),
            wait=wait_fixed(1),
            reraise=True,
        ):
            with attempt:
                # remove_replset_member retries for 60 seconds
                with MongoDBConnection(self.mongodb_config) as mongo:
                    mongo.remove_replset_member(self.unit_host(self.unit))

    except NotReadyError:
        # Best effort: the failure is logged and the hook is allowed to complete.
        logger.info(
            "Failed to remove %s from replica set, another member is syncing", self.unit.name
        )
    except PyMongoError as e:
        # Best effort: the failure is logged and the hook is allowed to complete.
        logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)

    self.update_termination_grace_period(HALF_MINUTE)

def _on_update_status(self, event: UpdateStatusEvent):
# user-made mistakes might result in other incorrect statuses. Prioritise informing users of
Expand Down
4 changes: 4 additions & 0 deletions src/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ class ApplicationHostNotFoundError(MongoError):
"""Raised when a queried host is not in the application peers or the current host."""


class EarlyRemovalOfConfigServerError(Exception):
    """Raised on an attempt to remove a config-server that is still related to shards."""


class MongoSecretError(MongoError):
"""Common parent for all Mongo Secret Exceptions."""

Expand Down

0 comments on commit 2a9b2bf

Please sign in to comment.