From e1223e9cc91683c7f12661f17156a83c9f068f5d Mon Sep 17 00:00:00 2001
From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com>
Date: Mon, 18 Mar 2024 11:50:38 +0100
Subject: [PATCH] [DPE-3609] Fix rotate cluster + replicaset password (#378)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Issue
Rotating the operator password on a MongoDB cluster or replica set results in
“flickering” errors on update_status.

## Solution
Fix this by adding extra processing that checks whether passwords and TLS
settings are consistent across the cluster / replica set.
---
 lib/charms/mongodb/v1/shards_interface.py    | 18 +++---
 src/charm.py                                 | 61 +++++++++++++------
 .../sharding_tests/test_sharding.py          | 42 ++++++++++++-
 .../sharding_tests/test_sharding_backups.py  | 12 +---
 4 files changed, 94 insertions(+), 39 deletions(-)

diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py
index ebd64b747..4203b5f08 100644
--- a/lib/charms/mongodb/v1/shards_interface.py
+++ b/lib/charms/mongodb/v1/shards_interface.py
@@ -56,7 +56,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 10
+LIBPATCH = 11
 KEYFILE_KEY = "key-file"
 HOSTS_KEY = "host"
 OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
@@ -64,6 +64,8 @@
 INT_TLS_CA_KEY = f"int-{Config.TLS.SECRET_CA_LABEL}"
 FORBIDDEN_REMOVAL_ERR_CODE = 20
 AUTH_FAILED_CODE = 18
+UNAUTHORISED_CODE = 13
+TLS_CANNOT_FIND_PRIMARY = 133
 
 
 class ShardAuthError(Exception):
@@ -471,7 +473,7 @@ def cluster_password_synced(self) -> bool:
             with MongoDBConnection(self.charm.mongodb_config) as mongod:
                 mongod.get_replset_status()
         except OperationFailure as e:
-            if e.code == 18:  # Unauthorized Error - i.e. password is not in sync
+            if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE]:
                 return False
             raise
         except ServerSelectionTimeoutError:
@@ -999,9 +1001,10 @@ def _is_added_to_cluster(self) -> bool:
             return self.charm.app.name in cluster_shards
         except OperationFailure as e:
             if e.code in [
-                13,
-                18,
-            ]:  # [Unauthorized, AuthenticationFailed ]we are not yet connected to mongos
+                UNAUTHORISED_CODE,
+                AUTH_FAILED_CODE,
+                TLS_CANNOT_FIND_PRIMARY,
+            ]:
                 return False
             raise
 
@@ -1026,10 +1029,7 @@ def cluster_password_synced(self) -> bool:
             with MongoDBConnection(self.charm.mongodb_config) as mongo:
                 mongod_reachable = mongo.is_ready
         except OperationFailure as e:
-            if e.code in [
-                13,
-                18,
-            ]:  # [Unauthorized, AuthenticationFailed ]we are not yet connected to mongos
+            if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE, TLS_CANNOT_FIND_PRIMARY]:
                 return False
             raise
         except ServerSelectionTimeoutError:
diff --git a/src/charm.py b/src/charm.py
index b07f02879..618a8929a 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -9,7 +9,7 @@
 import subprocess
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set, Tuple
 
 from charms.grafana_agent.v0.cos_agent import COSAgentProvider
 from charms.mongodb.v0.config_server_interface import ClusterProvider
@@ -72,12 +72,17 @@
     Unit,
     WaitingStatus,
 )
+from pymongo.errors import OperationFailure, ServerSelectionTimeoutError
 from tenacity import Retrying, before_log, retry, stop_after_attempt, wait_fixed
 
 from config import Config
 from exceptions import AdminUserCreationError, ApplicationHostNotFoundError
 from machine_helpers import MONGO_USER, ROOT_USER_GID, update_mongod_service
 
+AUTH_FAILED_CODE = 18
+UNAUTHORISED_CODE = 13
+TLS_CANNOT_FIND_PRIMARY = 133
+
 logger = logging.getLogger(__name__)
 
 APP_SCOPE = Config.Relations.APP_SCOPE
@@ -556,24 +561,13 @@ def _on_update_status(self, event: UpdateStatusEvent):
             self.unit.status = WaitingStatus("Waiting for MongoDB to start")
             return
 
-        # Cannot check more advanced MongoDB statuses if the cluster doesn't have passwords synced
-        # this can occur in two cases:
-        # 1. password rotation
-        # 2. race conditions when a new shard is addeded.
-        if (
-            not self.shard.cluster_password_synced()
-            or not self.config_server.cluster_password_synced()
-        ):
-            self.unit.status = WaitingStatus("Waiting to sync passwords across the cluster")
-            return
-
         # leader should periodically handle configuring the replica set. Incidents such as network
         # cuts can lead to new IP addresses and therefore will require a reconfigure. Especially
        # in the case that the leader a change in IP address it will not receive a relation event.
         if self.unit.is_leader():
             self._handle_reconfigure(event)
 
-        self.unit.status = self.get_status()
+        self.unit.status = self.process_statuses()
 
     def _on_get_primary_action(self, event: ActionEvent):
         event.set_results({"replica-set-primary": self.primary})
@@ -1406,18 +1400,17 @@ def get_invalid_integration_status(self) -> Optional[StatusBase]:
                 "Relation to s3-integrator is not supported, config role must be config-server"
             )
 
-    def get_status(self) -> StatusBase:
-        """Returns the status with the highest priority from backups, sharding, and mongod.
-
-        Note: it will never be the case that shard_status and config_server_status are both present
-        since the mongodb app can either be a shard or a config server, but not both.
- """ - # retrieve statuses of different services running on Charmed MongoDB + def get_statuses(self) -> Tuple: + """Retrieves statuses for the different processes running inside the unit.""" mongodb_status = build_unit_status(self.mongodb_config, self._unit_ip(self.unit)) shard_status = self.shard.get_shard_status() config_server_status = self.config_server.get_config_server_status() pbm_status = self.backups.get_pbm_status() + return (mongodb_status, shard_status, config_server_status, pbm_status) + def prioritize_statuses(self, statuses: Tuple) -> StatusBase: + """Returns the status with the highest priority from backups, sharding, and mongod.""" + mongodb_status, shard_status, config_server_status, pbm_status = statuses # failure in mongodb takes precedence over sharding and config server if not isinstance(mongodb_status, ActiveStatus): return mongodb_status @@ -1434,6 +1427,34 @@ def get_status(self) -> StatusBase: # if all statuses are active report mongodb status over sharding status return mongodb_status + def process_statuses(self) -> StatusBase: + """Retrieves statuses from processes inside charm and returns the highest priority status. + + When a non-fatal error occurs while processing statuses, the error is processed and + returned as a statuses. + """ + # retrieve statuses of different services running on Charmed MongoDB + deployment_mode = "replica set" if self.is_role(Config.Role.REPLICATION) else "cluster" + waiting_status = None + try: + statuses = self.get_statuses() + except OperationFailure as e: + if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE]: + waiting_status = f"Waiting to sync passwords across the {deployment_mode}" + elif e.code == TLS_CANNOT_FIND_PRIMARY: + waiting_status = ( + f"Waiting to sync internal membership across the {deployment_mode}" + ) + else: + raise + except ServerSelectionTimeoutError: + waiting_status = f"Waiting to sync internal membership across the {deployment_mode}" + + if waiting_status: + return WaitingStatus(waiting_status) + + return self.prioritize_statuses(statuses) + def is_relation_feasible(self, rel_interface) -> bool: """Returns true if the proposed relation is feasible.""" if self.is_sharding_component() and rel_interface in Config.Relations.DB_RELATIONS: diff --git a/tests/integration/sharding_tests/test_sharding.py b/tests/integration/sharding_tests/test_sharding.py index 28b15a3b1..462ea8e1c 100644 --- a/tests/integration/sharding_tests/test_sharding.py +++ b/tests/integration/sharding_tests/test_sharding.py @@ -6,6 +6,7 @@ import pytest from pytest_operator.plugin import OpsTest +from ..helpers import get_leader_id, get_password, set_password from .helpers import ( generate_mongodb_client, has_correct_shards, @@ -17,10 +18,18 @@ SHARD_ONE_APP_NAME = "shard-one" SHARD_TWO_APP_NAME = "shard-two" SHARD_THREE_APP_NAME = "shard-three" -SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME] +SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME] CONFIG_SERVER_APP_NAME = "config-server-one" +CLUSTER_APPS = [ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, +] SHARD_REL_NAME = "sharding" CONFIG_SERVER_REL_NAME = "config-server" +OPERATOR_USERNAME = "operator" +OPERATOR_PASSWORD = "operator-password" MONGODB_KEYFILE_PATH = "/var/snap/charmed-mongodb/current/etc/mongod/keyFile" # for now we have a large timeout due to the slow drainage of the `config.system.sessions` # collection. 
@@ -122,6 +131,37 @@ async def test_cluster_active(ops_test: OpsTest) -> None:
     ), "Config server did not process config properly"
 
 
+@pytest.mark.group(1)
+async def test_set_operator_password(ops_test: OpsTest):
+    """Tests that the cluster can safely set the operator password."""
+    for cluster_app_name in CLUSTER_APPS:
+        operator_password = await get_password(
+            ops_test, username=OPERATOR_USERNAME, app_name=cluster_app_name
+        )
+        assert (
+            operator_password != OPERATOR_PASSWORD
+        ), f"{cluster_app_name} is incorrectly already set to the new password."
+
+    # rotate password and verify that no unit goes into error as a result of password rotation
+    config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
+    await set_password(
+        ops_test, unit_id=config_leader_id, username=OPERATOR_USERNAME, password=OPERATOR_PASSWORD
+    )
+    await ops_test.model.wait_for_idle(
+        apps=CLUSTER_APPS,
+        status="active",
+        idle_period=20,
+    ),
+
+    for cluster_app_name in CLUSTER_APPS:
+        operator_password = await get_password(
+            ops_test, username=OPERATOR_USERNAME, app_name=cluster_app_name
+        )
+        assert (
+            operator_password == OPERATOR_PASSWORD
+        ), f"{cluster_app_name} did not rotate to new password."
+
+
 @pytest.mark.group(1)
 @pytest.mark.abort_on_fail
 async def test_sharding(ops_test: OpsTest) -> None:
diff --git a/tests/integration/sharding_tests/test_sharding_backups.py b/tests/integration/sharding_tests/test_sharding_backups.py
index 65db8f6e0..cf70b06b8 100644
--- a/tests/integration/sharding_tests/test_sharding_backups.py
+++ b/tests/integration/sharding_tests/test_sharding_backups.py
@@ -274,14 +274,10 @@ async def test_restore_backup(ops_test: OpsTest, add_writes_to_shards) -> None:
 async def test_migrate_restore_backup(ops_test: OpsTest, add_writes_to_shards) -> None:
     """Tests that sharded Charmed MongoDB cluster supports restores."""
     config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
-    # TODO Future Work - determine the source of the temporary error state that occurs during the
-    # rotation of the operator user.
     await set_password(
         ops_test, unit_id=config_leader_id, username="operator", password=OPERATOR_PASSWORD
     )
-    await ops_test.model.wait_for_idle(
-        apps=CLUSTER_APPS, status="active", idle_period=20, raise_on_error=False
-    ),
+    await ops_test.model.wait_for_idle(apps=CLUSTER_APPS, status="active", idle_period=20)
 
     # count total writes
     cluster_writes = await writes_helpers.get_cluster_writes_count(
@@ -326,14 +322,12 @@ async def test_migrate_restore_backup(ops_test: OpsTest, add_writes_to_shards) -
     await deploy_cluster_backup_test(ops_test, deploy_s3_integrator=False)
     await setup_cluster_and_s3(ops_test)
     config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
-    # TODO Future Work - determine the source of the temporary error state that occurs during the
-    # rotation of the operator user.
     await set_password(
         ops_test, unit_id=config_leader_id, username="operator", password=OPERATOR_PASSWORD
    )
     await ops_test.model.wait_for_idle(
-        apps=CLUSTER_APPS, status="active", idle_period=20, timeout=TIMEOUT, raise_on_error=False
-    ),
+        apps=CLUSTER_APPS, status="active", idle_period=20, timeout=TIMEOUT
+    )
 
     # find most recent backup id and restore
     leader_unit = await backup_helpers.get_leader_unit(