Commit fc992f0: patch stateful set

MiaAltieri committed Oct 15, 2024
1 parent 9c88b59 commit fc992f0
Showing 1 changed file with 76 additions and 13 deletions.

89 changes: 76 additions & 13 deletions src/charm.py
@@ -5,6 +5,7 @@
import json
import logging
import re
import time
from typing import Any, Dict, List, Optional, Set

import jinja2
@@ -41,6 +42,9 @@
    CrossAppVersionChecker,
    get_charm_revision,
)
from lightkube import Client
from lightkube.resources.apps_v1 import StatefulSet
from lightkube.types import PatchType
from ops.charm import (
    ActionEvent,
    CharmBase,
@@ -88,9 +92,8 @@
UNIT_SCOPE = Config.Relations.UNIT_SCOPE
Scopes = Config.Relations.Scopes

ONE_MINUTE = 60
ONE_YEAR = 31540000  # ~one year in seconds; long enough to never expire in practice
USER_CREATING_MAX_ATTEMPTS = 5
USER_CREATION_COOLDOWN = 30
REPLICA_SET_INIT_CHECK_TIMEOUT = 10
@@ -101,10 +104,10 @@ class MongoDBCharm(CharmBase):

    def __init__(self, *args):
        super().__init__(*args)
        self.framework.observe(self.on.mongod_pebble_ready, self._on_mongod_pebble_ready)
        self.framework.observe(self.on.config_changed, self._on_config_changed)
        self.framework.observe(self.on.start, self._on_start)
        self.framework.observe(self.on.stop, self._on_stop)
        self.framework.observe(self.on.update_status, self._on_update_status)
        self.framework.observe(
            self.on[Config.Relations.PEERS].relation_joined, self._relation_changes_handler
@@ -559,6 +562,7 @@ def _compare_secret_ids(secret_id1: str, secret_id2: str) -> bool:
        return False

    # BEGIN: charm events

    def _on_mongod_pebble_ready(self, event) -> None:
        """Configure MongoDB pebble layer specification."""
        # Get a reference to the container attribute
@@ -656,6 +660,19 @@ def _on_start(self, event) -> None:
        It is needed to install mongodb-clients inside the charm container
        to make this function work correctly.
        """
        # Patch the stateful set to have an increased termination grace period, to prevent data
        # loss on removed shards, since Juju only gives us a termination period of 30 seconds:
        # https://bugs.launchpad.net/juju/+bug/2035102

        # It doesn't matter whether we patch the stateful set before or after the charm has
        # started. The usual start hooks emitted by juju will already have been emitted, so we
        # can expect two rounds of restarts on one or more units (units that get initialised
        # late will only see one round of restarts). The second round of start hooks is emitted
        # **only after the replica set has been initialised**, which we have no control over.

        if self.unit.is_leader() and self.get_current_termination_period() != ONE_YEAR:
            self.update_termination_grace_period(ONE_YEAR)

        container = self.unit.get_container(Config.CONTAINER_NAME)
        if not container.can_connect():
            logger.debug("mongod container is not ready yet.")
@@ -765,20 +782,42 @@ def _reconcile_mongo_hosts_and_users(self, event: RelationEvent) -> None:
            logger.info("Deferring reconfigure: error=%r", e)
            event.defer()

    def get_current_termination_period(self) -> int:
        """Returns the current termination period for the stateful set of this juju application."""
        client = Client()
        statefulset = client.get(StatefulSet, name=self.app.name, namespace=self.model.name)
        return statefulset.spec.template.spec.terminationGracePeriodSeconds

    def update_termination_grace_period(self, seconds: int) -> None:
        """Patch the termination grace period for the stateful set of this juju application."""
        # Updating the termination grace period is only useful for shards, whose sudden
        # removal can result in data loss.
        if not self.is_role(Config.Role.SHARD):
            return

        client = Client()
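        # Bumping the force-update annotation changes the pod template even when the grace
        # period value is unchanged, which prompts the StatefulSet controller to roll the pods.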
        patch_data = {
            "spec": {
                "template": {
                    "spec": {"terminationGracePeriodSeconds": seconds},
                    "metadata": {"annotations": {"force-update": str(int(time.time()))}},
                }
            }
        }
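        # A merge patch only touches the fields present in patch_data; the rest of the
        # StatefulSet spec is left as-is.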
        client.patch(
            StatefulSet,
            name=self.app.name,
            namespace=self.model.name,
            obj=patch_data,
            patch_type=PatchType.MERGE,
        )

    def mongodb_storage_detaching(self, event) -> None:
        """Before storage detaches, allow the removing unit to remove itself from the set.

        If the removing unit is primary, also allow it to step down and elect another unit as
        primary while it still has access to its storage.
        """
        if self.upgrade_in_progress:
            # We cannot defer and prevent a user from removing a unit; log a warning instead.
            logger.warning(
@@ -806,9 +845,6 @@ def mongodb_storage_detaching(self, event) -> None:
            self.shard.wait_for_draining(mongos_hosts)
            logger.info("Shard successfully drained storage.")
            return

        try:
            # Retries over a period of 10 minutes in an attempt to resolve race conditions; it
            # is not possible to defer in storage-detached.
@@ -830,7 +866,23 @@
        except PyMongoError as e:
            logger.error("Failed to remove %s from replica set, error=%r", self.unit.name, e)
    def _on_stop(self, _) -> None:
        """Handle the stop event.

        Stop can occur after a user has refreshed, after a unit has been removed, or when a
        pod is getting restarted.
        """
        # This could be moved into the mongodb lib, i.e. a function wait_for_new_primary, but
        # this is just a POC.
        waiting = 0
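        # Ask MongoDB to elect a new primary, polling for up to ONE_MINUTE seconds until this
        # unit is no longer the primary (or it is the only unit left).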
        while (
            self.unit.name == self.primary and len(self.peers_units) > 1 and waiting < ONE_MINUTE
        ):
            logger.debug("Stepping down current primary, before stopping.")
            with MongoDBConnection(self.mongodb_config) as mongo:
                mongo.step_down_primary()
            time.sleep(1)
            waiting += 1

    def _on_update_status(self, event: UpdateStatusEvent):
        # user-made mistakes might result in other incorrect statuses. Prioritise informing users of
@@ -866,6 +918,17 @@ def _on_update_status(self, event: UpdateStatusEvent):

        self.status.set_and_share_status(self.status.process_statuses())

        # We must ensure that juju does not overwrite our termination period, so we update it
        # as needed. However, updating the termination period can result in an onslaught of
        # events, including the upgrade event. To prevent this from interfering with upgrades,
        # do not update the termination period while an upgrade is occurring.
        if (
            self.unit.is_leader()
            and self.get_current_termination_period() != ONE_YEAR
            and not self.upgrade_in_progress
        ):
            self.update_termination_grace_period(ONE_YEAR)

    # END: charm events

    # BEGIN: actions