From cc52ce1daf1c721a02787bb26e54939be46c9072 Mon Sep 17 00:00:00 2001 From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com> Date: Fri, 27 Oct 2023 15:38:39 +0200 Subject: [PATCH] [DPE-2763] POC Status reporting shard side (#275) ## Issue Statuses are not properly reported for shards in update status hook ## Solution Report statuses ## Testing ``` # deploy charms juju deploy ./*charm --config role="config-server" config-server-one juju deploy ./*charm --config role="shard" shard-one juju deploy ./*charm --config role="shard" shard-two # speed up frequency of status checks juju model-config update-status-hook-interval=10s # monitor `juju status --watch 1s` Unit Workload Agent Machine Public address Ports Message config-server-one/0* active idle 2 10.61.64.75 27017-27018/tcp Primary shard-one/0* blocked idle 0 10.61.64.126 27017/tcp missing relation to config server shard-two/0* blocked idle 1 10.61.64.216 27017/tcp missing relation to config server # relate application juju integrate config-server-one:config-server shard-one:sharding juju integrate config-server-one:config-server shard-two:sharding # monitor `juju status --watch 1s` Unit Workload Agent Machine Public address Ports Message config-server-one/0* active idle 2 10.61.64.75 27017-27018/tcp Primary shard-one/0* active idle 0 10.61.64.126 27017/tcp Shard connected to config-server: config-server-one shard-two/0* active idle 1 10.61.64.216 27017/tcp Shard connected to config-server: config-server-one # remove a relation to shard juju remove-relation config-server-one:config-server shard-two:sharding # monitor `juju status --watch 1s` config-server-one/0* active idle 0 10.61.64.50 27017-27018/tcp Primary shard-one/0* active idle 1 10.61.64.235 27017/tcp Shard connected to config-server: config-server-one shard-two/0* active idle 2 10.61.64.128 27017/tcp Shard drained from cluster, ready for removal # add non supported relation cd 
mongodb-operator/tests/integration/relation_tests/new_relations/application-charm charmcraft pack juju deploy ./*charm juju relation shard-one application # monitor `juju status --watch 1s` application/0* active idle 3 10.61.64.203 config-server-one/0* active idle 0 10.61.64.43 27017-27018/tcp shard-one/0* blocked idle 1 10.61.64.31 27017/tcp Sharding roles do not support mongodb_client interface. shard-two/0* active idle 2 10.61.64.239 27017/tcp ``` --- lib/charms/mongodb/v1/mongodb_provider.py | 37 +++-- .../{v0 => v1}/mongodb_vm_legacy_provider.py | 9 +- lib/charms/mongodb/v1/mongos.py | 34 ++++- lib/charms/mongodb/v1/shards_interface.py | 128 +++++++++++++++--- src/charm.py | 89 ++++++++++-- src/config.py | 1 + 6 files changed, 249 insertions(+), 49 deletions(-) rename lib/charms/mongodb/{v0 => v1}/mongodb_vm_legacy_provider.py (95%) diff --git a/lib/charms/mongodb/v1/mongodb_provider.py b/lib/charms/mongodb/v1/mongodb_provider.py index d0484a479..00e052b1d 100644 --- a/lib/charms/mongodb/v1/mongodb_provider.py +++ b/lib/charms/mongodb/v1/mongodb_provider.py @@ -29,7 +29,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 1 +LIBPATCH = 2 logger = logging.getLogger(__name__) REL_NAME = "database" @@ -82,6 +82,28 @@ def __init__(self, charm: CharmBase, substrate="k8s", relation_name: str = "data self.database_provides.on.database_requested, self._on_relation_event ) + def pass_hook_checks(self) -> bool: + """Runs the pre-hooks checks for MongoDBProvider, returns True if all pass.""" + if not self.charm.is_relation_feasible(self.relation_name): + logger.info("Skipping code for relations.") + return False + + # legacy relations have auth disabled, which new relations require + if self.model.get_relation(LEGACY_REL_NAME): + self.charm.unit.status = BlockedStatus("cannot have both legacy and new relations") + logger.error("Auth disabled due to existing connections to legacy 
relations") + return False + + if not self.charm.unit.is_leader(): + return False + + # We shouldn't try to create or update users if the database is not + # initialised. We will create users as part of initialisation. + if not self.charm.db_initialised: + return False + + return True + def _on_relation_event(self, event): """Handle relation joined events. @@ -90,17 +112,8 @@ def _on_relation_event(self, event): data. As a result, related charm gets credentials for accessing the MongoDB database. """ - if not self.charm.unit.is_leader(): - return - # We shouldn't try to create or update users if the database is not - # initialised. We will create users as part of initialisation. - if "db_initialised" not in self.charm.app_peer_data: - return - - # legacy relations have auth disabled, which new relations require - if self.model.get_relation(LEGACY_REL_NAME): - self.charm.unit.status = BlockedStatus("cannot have both legacy and new relations") - logger.error("Auth disabled due to existing connections to legacy relations") + if not self.pass_hook_checks(): + logger.info("Skipping %s: hook checks did not pass", type(event)) return # If auth is disabled but there are no legacy relation users, this means that legacy diff --git a/lib/charms/mongodb/v0/mongodb_vm_legacy_provider.py b/lib/charms/mongodb/v1/mongodb_vm_legacy_provider.py similarity index 95% rename from lib/charms/mongodb/v0/mongodb_vm_legacy_provider.py rename to lib/charms/mongodb/v1/mongodb_vm_legacy_provider.py index ef6db7e12..6eeb38600 100644 --- a/lib/charms/mongodb/v0/mongodb_vm_legacy_provider.py +++ b/lib/charms/mongodb/v1/mongodb_vm_legacy_provider.py @@ -17,11 +17,11 @@ LIBID = "896a48bc89b84d30839335bb37170509" # Increment this major API version when introducing breaking changes -LIBAPI = 0 +LIBAPI = 1 # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 4 +LIBPATCH = 0 logger = logging.getLogger(__name__) 
REL_NAME = "database" @@ -41,6 +41,7 @@ def __init__(self, charm): """Manager of MongoDB client relations.""" super().__init__(charm, "client-relations") self.charm = charm + self.relation_name = LEGACY_REL_NAME self.framework.observe( self.charm.on[LEGACY_REL_NAME].relation_created, self._on_legacy_relation_created ) @@ -64,6 +65,10 @@ def _on_legacy_relation_created(self, event): ) return + if not self.charm.is_relation_feasible(self.relation_name): + logger.info("Skipping code for legacy relations.") + return + # If auth is already disabled its likely it has a connection with another legacy relation # user. Shutting down and restarting mongod would lead to downtime for the other legacy # relation user and hence shouldn't be done. Not to mention there is no need to disable diff --git a/lib/charms/mongodb/v1/mongos.py b/lib/charms/mongodb/v1/mongos.py index a444d21fa..6bc9c85a9 100644 --- a/lib/charms/mongodb/v1/mongos.py +++ b/lib/charms/mongodb/v1/mongos.py @@ -9,7 +9,7 @@ from charms.mongodb.v0.mongodb import NotReadyError from pymongo import MongoClient, collection -from tenacity import Retrying, stop_after_delay, wait_fixed +from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from config import Config @@ -21,7 +21,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 1 +LIBPATCH = 2 # path to store mongodb ketFile logger = logging.getLogger(__name__) @@ -334,6 +334,36 @@ def _log_removal_info(self, removal_info, shard_name): ",".join(dbs_to_move), ) + @property + def is_ready(self) -> bool: + """Is mongos ready for service requests. + + Returns: + True if the service is ready, False otherwise. Retries over a period of 60 seconds to + allow the server time to start up. 
+ + Raises: + ConfigurationError, OperationFailure + """ + try: + for attempt in Retrying(stop=stop_after_delay(60), wait=wait_fixed(3)): + with attempt: + # The ping command is cheap and does not require auth. + self.client.admin.command("ping") + except RetryError: + return False + + return True + + def is_shard_aware(self, shard_name: str) -> bool: + """Returns True if provided shard is shard aware.""" + sc_status = self.client.admin.command("listShards") + for shard in sc_status["shards"]: + if shard["_id"] == shard_name: + return shard["state"] == 1 + + return False + def _retrieve_remaining_chunks(self, removal_info) -> int: """Parses the remaining chunks to remove from removeShard command.""" return removal_info["remaining"]["chunks"] if "remaining" in removal_info else 0 diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py index b6098a5ea..6a053d30a 100644 --- a/lib/charms/mongodb/v1/shards_interface.py +++ b/lib/charms/mongodb/v1/shards_interface.py @@ -18,6 +18,7 @@ PyMongoError, ) from charms.mongodb.v1.helpers import KEY_FILE +from charms.mongodb.v1.mongodb_provider import LEGACY_REL_NAME, REL_NAME from charms.mongodb.v1.mongos import ( BalancerNotEnabledError, MongosConnection, @@ -28,7 +29,13 @@ from charms.mongodb.v1.users import MongoDBUser, OperatorUser from ops.charm import CharmBase, EventBase, RelationBrokenEvent from ops.framework import Object -from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, WaitingStatus +from ops.model import ( + ActiveStatus, + BlockedStatus, + MaintenanceStatus, + StatusBase, + WaitingStatus, +) from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed from config import Config @@ -44,7 +51,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 1 +LIBPATCH = 2 KEYFILE_KEY = "key-file" HOSTS_KEY = "host" OPERATOR_PASSWORD_KEY = 
MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username()) @@ -106,12 +113,8 @@ def _on_relation_joined(self, event): def pass_hook_checks(self, event: EventBase) -> bool: """Runs the pre-hooks checks for ShardingProvider, returns True if all pass.""" - if self.charm.is_role(Config.Role.REPLICATION): - self.charm.unit.status = BlockedStatus("role replication does not support sharding") - logger.error( - "Skipping %s. Sharding interface not supported with config role=replication.", - type(event), - ) + if not self.charm.is_relation_feasible(self.relation_name): + logger.info("Skipping event %s , relation not feasible.", type(event)) return False if not self.charm.is_role(Config.Role.CONFIG_SERVER): @@ -268,6 +271,10 @@ def update_mongos_hosts(self): for relation in self.charm.model.relations[self.relation_name]: self._update_relation_data(relation.id, {HOSTS_KEY: json.dumps(self.charm._unit_ips)}) + def get_config_server_status(self): + """TODO: Implement this function in a separate PR.""" + return None + def _update_relation_data(self, relation_id: int, data: dict) -> None: """Updates a set of key-value pairs in the relation. @@ -349,6 +356,12 @@ def _on_relation_changed(self, event): logger.info("Skipping relation joined event: hook checks re not passed") return + # if re-using an old shard, re-set drained flag. 
+ if self.charm.unit.is_leader(): + self.charm.app_peer_data["drained"] = json.dumps(False) + + self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") + # shards rely on the config server for secrets relation_data = event.relation.data[event.app] self.update_keyfile(key_file_contents=relation_data.get(KEYFILE_KEY)) @@ -361,8 +374,6 @@ def _on_relation_changed(self, event): event.defer() return - self.charm.unit.status = MaintenanceStatus("Adding shard to config-server") - if not self.charm.unit.is_leader(): return @@ -377,13 +388,12 @@ def _on_relation_changed(self, event): ) return - # TODO future PR, leader unit verifies shard was added to cluster (update-status hook) + self.charm.app_peer_data["added_to_cluster"] = json.dumps(True) def pass_hook_checks(self, event): """Runs the pre-hooks checks for ConfigServerRequirer, returns True if all pass.""" - if self.charm.is_role(Config.Role.REPLICATION): - self.charm.unit.status = BlockedStatus("role replication does not support sharding") - logger.error("sharding interface not supported with config role=replication") + if not self.charm.is_relation_feasible(self.relation_name): + logger.info("Skipping event %s , relation not feasible.", type(event)) return False if not self.charm.is_role(Config.Role.SHARD): @@ -426,8 +436,9 @@ def _on_relation_broken(self, event: RelationBrokenEvent) -> None: self.wait_for_draining(mongos_hosts) self.charm.unit.status = ActiveStatus("Shard drained from cluster, ready for removal") - # TODO future PR, leader unit displays this message in update-status hook - # TODO future PR, check for shard drainage when removing application + + if self.charm.unit.is_leader(): + self.charm.app_peer_data["added_to_cluster"] = json.dumps(False) def wait_for_draining(self, mongos_hosts: List[str]): """Waits for shards to be drained from sharded cluster.""" @@ -438,6 +449,7 @@ def wait_for_draining(self, mongos_hosts: List[str]): # no need to continuously check and abuse resources 
while shard is draining time.sleep(10) drained = self.drained(mongos_hosts, self.charm.app.name) + self.charm.unit.status = MaintenanceStatus("Draining shard from cluster") draining_status = ( "Shard is still draining" if not drained else "Shard is fully drained." ) @@ -459,6 +471,44 @@ def wait_for_draining(self, mongos_hosts: List[str]): break + def get_shard_status(self) -> Optional[StatusBase]: + """Returns the current status of the shard. + + Note: No need to report if currently draining, since that check blocks other hooks from + executing. + """ + if not self.charm.is_role(Config.Role.SHARD): + logger.info("skipping status check, charm is not running as a shard") + return None + + if not self.charm.db_initialised: + logger.info("No status for shard to report, waiting for db to be initialised.") + return None + + if self.model.get_relation(LEGACY_REL_NAME): + return BlockedStatus(f"relation {LEGACY_REL_NAME} to shard not supported.") + + if self.model.get_relation(REL_NAME): + return BlockedStatus(f"relation {REL_NAME} to shard not supported.") + + if not self.model.get_relation(self.relation_name) and not self.charm.drained: + return BlockedStatus("missing relation to config server") + + if not self.model.get_relation(self.relation_name) and self.charm.drained: + return ActiveStatus("Shard drained from cluster, ready for removal") + + if not self._is_mongos_reachable(): + return BlockedStatus("Config server unreachable") + + if not self._is_added_to_cluster(): + return MaintenanceStatus("Adding shard to config-server") + + if not self._is_shard_aware(): + return BlockedStatus("Shard is not yet shard aware") + + config_server_name = self.get_related_config_server() + return ActiveStatus(f"Shard connected to config-server: {config_server_name}") + def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool: """Returns whether a shard has been drained from the cluster. 
@@ -564,16 +614,58 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None: if relation: relation.data[self.charm.model.app].update(data) + def _is_mongos_reachable(self) -> bool: + """Returns True if mongos is reachable.""" + if not self.model.get_relation(self.relation_name): + logger.info("Mongos is not reachable, no relation to config-server") + return False + + mongos_hosts = self.get_mongos_hosts() + if not mongos_hosts: + return False + + self.charm.remote_mongos_config(set(mongos_hosts)) + config = self.charm.remote_mongos_config(set(mongos_hosts)) + + # use a URI that is not dependent on the operator password, as we are not guaranteed that + # the shard has received the password yet. + uri = f"mongodb://{','.join(mongos_hosts)}" + with MongosConnection(config, uri) as mongo: + return mongo.is_ready + + def _is_added_to_cluster(self) -> bool: + """Returns True if the shard has been added to the cluster.""" + return json.loads(self.charm.app_peer_data.get("added_to_cluster", "False")) + + def _is_shard_aware(self) -> bool: + """Returns True if shard is in cluster and shard aware.""" + if not self.model.get_relation(self.relation_name): + logger.info( + "Mongos is not reachable, no relation to config-server, cannot check shard status. 
+ ) + return False + + mongos_hosts = self.get_mongos_hosts() + with MongosConnection(self.charm.remote_mongos_config(set(mongos_hosts))) as mongo: + return mongo.is_shard_aware(shard_name=self.charm.app.name) + def has_config_server(self) -> bool: """Returns True if currently related to config server.""" return len(self.charm.model.relations[self.relation_name]) > 0 - def get_related_config_server(self) -> List[str]: + def get_related_config_server(self) -> str: """Returns the related config server.""" - return [rel.app.name for rel in self.charm.model.relations[self.relation_name]] + if self.relation_name not in self.charm.model.relations: + return None + + # metadata.yaml prevents having multiple config servers + return self.charm.model.relations[self.relation_name][0].app.name def get_mongos_hosts(self) -> List[str]: """Returns a list of IP addresses for the mongos hosts.""" # only one related config-server is possible config_server_relation = self.charm.model.relations[self.relation_name][0] + if HOSTS_KEY not in config_server_relation.data[config_server_relation.app]: + return + return json.loads(config_server_relation.data[config_server_relation.app].get(HOSTS_KEY)) diff --git a/src/charm.py b/src/charm.py index 909330b89..b2e5fae5f 100755 --- a/src/charm.py +++ b/src/charm.py @@ -20,7 +20,6 @@ ) from charms.mongodb.v0.mongodb_secrets import SecretCache, generate_secret_label from charms.mongodb.v0.mongodb_tls import MongoDBTLS -from charms.mongodb.v0.mongodb_vm_legacy_provider import MongoDBLegacyProvider from charms.mongodb.v1.helpers import ( KEY_FILE, TLS_EXT_CA_FILE, @@ -35,6 +34,7 @@ ) from charms.mongodb.v1.mongodb_backups import S3_RELATION, MongoDBBackups from charms.mongodb.v1.mongodb_provider import MongoDBProvider +from charms.mongodb.v1.mongodb_vm_legacy_provider import MongoDBLegacyProvider from charms.mongodb.v1.mongos import MongosConfiguration from charms.mongodb.v1.shards_interface import ConfigServerRequirer, ShardingProvider from 
charms.mongodb.v1.users import ( @@ -66,6 +66,7 @@ BlockedStatus, MaintenanceStatus, Relation, + StatusBase, Unit, WaitingStatus, ) @@ -574,20 +575,7 @@ def _on_update_status(self, event: UpdateStatusEvent): if self.unit.is_leader(): self._handle_reconfigure(event) - # update the units status based on it's replica set config and backup status. An error in - # the status of MongoDB takes precedence over pbm status. - mongodb_status = build_unit_status(self.mongodb_config, self._unit_ip(self.unit)) - pbm_status = self.backups.get_pbm_status() - if ( - not isinstance(mongodb_status, ActiveStatus) - or not self.model.get_relation( - S3_RELATION - ) # if s3 relation doesn't exist only report MongoDB status - or isinstance(pbm_status, ActiveStatus) # pbm is ready then report the MongoDB status - ): - self.unit.status = mongodb_status - else: - self.unit.status = pbm_status + self.unit.status = self.get_status() def _on_get_primary_action(self, event: ActionEvent): event.set_results({"replica-set-primary": self._primary}) @@ -1361,6 +1349,77 @@ def _is_removing_last_replica(self) -> bool: """Returns True if the last replica (juju unit) is getting removed.""" return self.app.planned_units() == 0 and len(self._peers.units) == 0 + def get_status(self) -> StatusBase: + """Returns the status with the highest priority from backups, sharding, and mongod. + + Note: it will never be the case that shard_status and config_server_status are both present + since the mongodb app can either be a shard or a config server, but not both. 
+ """ + # retrieve statuses of different services running on Charmed MongoDB + mongodb_status = build_unit_status(self.mongodb_config, self._unit_ip(self.unit)) + shard_status = self.shard.get_shard_status() if self.is_role(Config.Role.SHARD) else None + config_server_status = ( + self.config_server.get_config_server_status() + if self.is_role(Config.Role.CONFIG_SERVER) + else None + ) + pbm_status = ( + self.backups.get_pbm_status() if self.model.get_relation(S3_RELATION) else None + ) + + # failure in mongodb takes precedence over sharding and config server + if not isinstance(mongodb_status, ActiveStatus): + return mongodb_status + + if shard_status and not isinstance(shard_status, ActiveStatus): + return shard_status + + if config_server_status and not isinstance(config_server_status, ActiveStatus): + return config_server_status + + if pbm_status and not isinstance(pbm_status, ActiveStatus): + return pbm_status + + # if all statuses are active report sharding statuses over mongodb status + if isinstance(shard_status, ActiveStatus): + return shard_status + + if isinstance(config_server_status, ActiveStatus): + return config_server_status + + return mongodb_status + + def is_relation_feasible(self, rel_interface) -> bool: + """Returns true if the proposed relation is feasible.""" + if self.is_sharding_component() and rel_interface in Config.Relations.DB_RELATIONS: + self.unit.status = BlockedStatus( + f"Sharding roles do not support {rel_interface} interface." + ) + logger.error( + "Charm is in sharding role: %s. Does not support %s interface.", + self.role, + rel_interface, + ) + return False + + if ( + not self.is_sharding_component() + and rel_interface == Config.Relations.SHARDING_RELATIONS_NAME + ): + self.unit.status = BlockedStatus("role replication does not support sharding") + logger.error( + "Charm is in sharding role: %s. 
Does not support %s interface.", + self.role, + rel_interface, + ) + return False + + return True + + def is_sharding_component(self) -> bool: + """Returns true if charm is running as a sharded component.""" + return self.is_role(Config.Role.SHARD) or self.is_role(Config.Role.CONFIG_SERVER) + # END: helper functions diff --git a/src/config.py b/src/config.py index 073065bee..a00d8b4d4 100644 --- a/src/config.py +++ b/src/config.py @@ -76,6 +76,7 @@ class Relations: CONFIG_SERVER_RELATIONS_NAME = "config-server" APP_SCOPE = "app" UNIT_SCOPE = "unit" + DB_RELATIONS = [OBSOLETE_RELATIONS_NAME, NAME] Scopes = Literal[APP_SCOPE, UNIT_SCOPE] class Secrets: