From e1223e9cc91683c7f12661f17156a83c9f068f5d Mon Sep 17 00:00:00 2001
From: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com>
Date: Mon, 18 Mar 2024 11:50:38 +0100
Subject: [PATCH] [DPE-3609] Fix rotate cluster + replicaset password (#378)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Issue
Rotating the operator password on a MongoDB cluster or replica set results in
“flickering” errors on update_status.

## Solution
Fix this by adding extra processing that checks whether passwords and TLS
settings are consistent across the cluster / replica set.
---
 lib/charms/mongodb/v1/shards_interface.py    | 18 +++---
 src/charm.py                                 | 61 +++++++++++++------
 .../sharding_tests/test_sharding.py          | 42 ++++++++++++-
 .../sharding_tests/test_sharding_backups.py  | 12 +---
 4 files changed, 94 insertions(+), 39 deletions(-)

diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py
index ebd64b747..4203b5f08 100644
--- a/lib/charms/mongodb/v1/shards_interface.py
+++ b/lib/charms/mongodb/v1/shards_interface.py
@@ -56,7 +56,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 10
+LIBPATCH = 11
 KEYFILE_KEY = "key-file"
 HOSTS_KEY = "host"
 OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
@@ -64,6 +64,8 @@
 INT_TLS_CA_KEY = f"int-{Config.TLS.SECRET_CA_LABEL}"
 FORBIDDEN_REMOVAL_ERR_CODE = 20
 AUTH_FAILED_CODE = 18
+UNAUTHORISED_CODE = 13
+TLS_CANNOT_FIND_PRIMARY = 133
 
 
 class ShardAuthError(Exception):
@@ -471,7 +473,7 @@ def cluster_password_synced(self) -> bool:
             with MongoDBConnection(self.charm.mongodb_config) as mongod:
                 mongod.get_replset_status()
         except OperationFailure as e:
-            if e.code == 18:  # Unauthorized Error - i.e. password is not in sync
+            if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE]:
                 return False
             raise
         except ServerSelectionTimeoutError:
@@ -999,9 +1001,10 @@ def _is_added_to_cluster(self) -> bool:
             return self.charm.app.name in cluster_shards
         except OperationFailure as e:
             if e.code in [
-                13,
-                18,
-            ]:  # [Unauthorized, AuthenticationFailed ]we are not yet connected to mongos
+                UNAUTHORISED_CODE,
+                AUTH_FAILED_CODE,
+                TLS_CANNOT_FIND_PRIMARY,
+            ]:
                 return False
             raise
 
@@ -1026,10 +1029,7 @@ def cluster_password_synced(self) -> bool:
             with MongoDBConnection(self.charm.mongodb_config) as mongo:
                 mongod_reachable = mongo.is_ready
         except OperationFailure as e:
-            if e.code in [
-                13,
-                18,
-            ]:  # [Unauthorized, AuthenticationFailed ]we are not yet connected to mongos
+            if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE, TLS_CANNOT_FIND_PRIMARY]:
                 return False
             raise
         except ServerSelectionTimeoutError:
diff --git a/src/charm.py b/src/charm.py
index b07f02879..618a8929a 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -9,7 +9,7 @@
 import subprocess
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set, Tuple
 
 from charms.grafana_agent.v0.cos_agent import COSAgentProvider
 from charms.mongodb.v0.config_server_interface import ClusterProvider
@@ -72,12 +72,17 @@
     Unit,
     WaitingStatus,
 )
+from pymongo.errors import OperationFailure, ServerSelectionTimeoutError
 from tenacity import Retrying, before_log, retry, stop_after_attempt, wait_fixed
 
 from config import Config
 from exceptions import AdminUserCreationError, ApplicationHostNotFoundError
 from machine_helpers import MONGO_USER, ROOT_USER_GID, update_mongod_service
 
+AUTH_FAILED_CODE = 18
+UNAUTHORISED_CODE = 13
+TLS_CANNOT_FIND_PRIMARY = 133
+
 logger = logging.getLogger(__name__)
 
 APP_SCOPE = Config.Relations.APP_SCOPE
@@ -556,24 +561,13 @@ def _on_update_status(self, event: UpdateStatusEvent):
             self.unit.status = WaitingStatus("Waiting for MongoDB to start")
             return
 
-        # Cannot check more advanced MongoDB statuses if the cluster doesn't have passwords synced
-        # this can occur in two cases:
-        # 1. password rotation
-        # 2. race conditions when a new shard is addeded.
-        if (
-            not self.shard.cluster_password_synced()
-            or not self.config_server.cluster_password_synced()
-        ):
-            self.unit.status = WaitingStatus("Waiting to sync passwords across the cluster")
-            return
-
         # leader should periodically handle configuring the replica set. Incidents such as network
         # cuts can lead to new IP addresses and therefore will require a reconfigure. Especially
        # in the case that the leader a change in IP address it will not receive a relation event.
         if self.unit.is_leader():
             self._handle_reconfigure(event)
 
-        self.unit.status = self.get_status()
+        self.unit.status = self.process_statuses()
 
     def _on_get_primary_action(self, event: ActionEvent):
         event.set_results({"replica-set-primary": self.primary})
@@ -1406,18 +1400,17 @@ def get_invalid_integration_status(self) -> Optional[StatusBase]:
                 "Relation to s3-integrator is not supported, config role must be config-server"
             )
 
-    def get_status(self) -> StatusBase:
-        """Returns the status with the highest priority from backups, sharding, and mongod.
-
-        Note: it will never be the case that shard_status and config_server_status are both present
-        since the mongodb app can either be a shard or a config server, but not both.
- """ - # retrieve statuses of different services running on Charmed MongoDB + def get_statuses(self) -> Tuple: + """Retrieves statuses for the different processes running inside the unit.""" mongodb_status = build_unit_status(self.mongodb_config, self._unit_ip(self.unit)) shard_status = self.shard.get_shard_status() config_server_status = self.config_server.get_config_server_status() pbm_status = self.backups.get_pbm_status() + return (mongodb_status, shard_status, config_server_status, pbm_status) + def prioritize_statuses(self, statuses: Tuple) -> StatusBase: + """Returns the status with the highest priority from backups, sharding, and mongod.""" + mongodb_status, shard_status, config_server_status, pbm_status = statuses # failure in mongodb takes precedence over sharding and config server if not isinstance(mongodb_status, ActiveStatus): return mongodb_status @@ -1434,6 +1427,34 @@ def get_status(self) -> StatusBase: # if all statuses are active report mongodb status over sharding status return mongodb_status + def process_statuses(self) -> StatusBase: + """Retrieves statuses from processes inside charm and returns the highest priority status. + + When a non-fatal error occurs while processing statuses, the error is processed and + returned as a statuses. + """ + # retrieve statuses of different services running on Charmed MongoDB + deployment_mode = "replica set" if self.is_role(Config.Role.REPLICATION) else "cluster" + waiting_status = None + try: + statuses = self.get_statuses() + except OperationFailure as e: + if e.code in [UNAUTHORISED_CODE, AUTH_FAILED_CODE]: + waiting_status = f"Waiting to sync passwords across the {deployment_mode}" + elif e.code == TLS_CANNOT_FIND_PRIMARY: + waiting_status = ( + f"Waiting to sync internal membership across the {deployment_mode}" + ) + else: + raise + except ServerSelectionTimeoutError: + waiting_status = f"Waiting to sync internal membership across the {deployment_mode}" + + if waiting_status: + return WaitingStatus(waiting_status) + + return self.prioritize_statuses(statuses) + def is_relation_feasible(self, rel_interface) -> bool: """Returns true if the proposed relation is feasible.""" if self.is_sharding_component() and rel_interface in Config.Relations.DB_RELATIONS: diff --git a/tests/integration/sharding_tests/test_sharding.py b/tests/integration/sharding_tests/test_sharding.py index 28b15a3b1..462ea8e1c 100644 --- a/tests/integration/sharding_tests/test_sharding.py +++ b/tests/integration/sharding_tests/test_sharding.py @@ -6,6 +6,7 @@ import pytest from pytest_operator.plugin import OpsTest +from ..helpers import get_leader_id, get_password, set_password from .helpers import ( generate_mongodb_client, has_correct_shards, @@ -17,10 +18,18 @@ SHARD_ONE_APP_NAME = "shard-one" SHARD_TWO_APP_NAME = "shard-two" SHARD_THREE_APP_NAME = "shard-three" -SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME] +SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME] CONFIG_SERVER_APP_NAME = "config-server-one" +CLUSTER_APPS = [ + CONFIG_SERVER_APP_NAME, + SHARD_ONE_APP_NAME, + SHARD_TWO_APP_NAME, + SHARD_THREE_APP_NAME, +] SHARD_REL_NAME = "sharding" CONFIG_SERVER_REL_NAME = "config-server" +OPERATOR_USERNAME = "operator" +OPERATOR_PASSWORD = "operator-password" MONGODB_KEYFILE_PATH = "/var/snap/charmed-mongodb/current/etc/mongod/keyFile" # for now we have a large timeout due to the slow drainage of the `config.system.sessions` # collection. 
@@ -122,6 +131,37 @@ async def test_cluster_active(ops_test: OpsTest) -> None:
     ), "Config server did not process config properly"
 
 
+@pytest.mark.group(1)
+async def test_set_operator_password(ops_test: OpsTest):
+    """Tests that the cluster can safely set the operator password."""
+    for cluster_app_name in CLUSTER_APPS:
+        operator_password = await get_password(
+            ops_test, username=OPERATOR_USERNAME, app_name=cluster_app_name
+        )
+        assert (
+            operator_password != OPERATOR_PASSWORD
+        ), f"{cluster_app_name} is incorrectly already set to the new password."
+
+    # rotate password and verify that no unit goes into error as a result of password rotation
+    config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
+    await set_password(
+        ops_test, unit_id=config_leader_id, username=OPERATOR_USERNAME, password=OPERATOR_PASSWORD
+    )
+    await ops_test.model.wait_for_idle(
+        apps=CLUSTER_APPS,
+        status="active",
+        idle_period=20,
+    ),
+
+    for cluster_app_name in CLUSTER_APPS:
+        operator_password = await get_password(
+            ops_test, username=OPERATOR_USERNAME, app_name=cluster_app_name
+        )
+        assert (
+            operator_password == OPERATOR_PASSWORD
+        ), f"{cluster_app_name} did not rotate to new password."
+
+
 @pytest.mark.group(1)
 @pytest.mark.abort_on_fail
 async def test_sharding(ops_test: OpsTest) -> None:
diff --git a/tests/integration/sharding_tests/test_sharding_backups.py b/tests/integration/sharding_tests/test_sharding_backups.py
index 65db8f6e0..cf70b06b8 100644
--- a/tests/integration/sharding_tests/test_sharding_backups.py
+++ b/tests/integration/sharding_tests/test_sharding_backups.py
@@ -274,14 +274,10 @@ async def test_restore_backup(ops_test: OpsTest, add_writes_to_shards) -> None:
 async def test_migrate_restore_backup(ops_test: OpsTest, add_writes_to_shards) -> None:
     """Tests that sharded Charmed MongoDB cluster supports restores."""
     config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
-    # TODO Future Work - determine the source of the temporary error state that occurs during the
-    # rotation of the operator user.
     await set_password(
         ops_test, unit_id=config_leader_id, username="operator", password=OPERATOR_PASSWORD
     )
-    await ops_test.model.wait_for_idle(
-        apps=CLUSTER_APPS, status="active", idle_period=20, raise_on_error=False
-    ),
+    await ops_test.model.wait_for_idle(apps=CLUSTER_APPS, status="active", idle_period=20)
 
     # count total writes
     cluster_writes = await writes_helpers.get_cluster_writes_count(
@@ -326,14 +322,12 @@ async def test_migrate_restore_backup(ops_test: OpsTest, add_writes_to_shards) -
     await deploy_cluster_backup_test(ops_test, deploy_s3_integrator=False)
     await setup_cluster_and_s3(ops_test)
     config_leader_id = await get_leader_id(ops_test, app_name=CONFIG_SERVER_APP_NAME)
-    # TODO Future Work - determine the source of the temporary error state that occurs during the
-    # rotation of the operator user.
     await set_password(
         ops_test, unit_id=config_leader_id, username="operator", password=OPERATOR_PASSWORD
    )
     await ops_test.model.wait_for_idle(
-        apps=CLUSTER_APPS, status="active", idle_period=20, timeout=TIMEOUT, raise_on_error=False
-    ),
+        apps=CLUSTER_APPS, status="active", idle_period=20, timeout=TIMEOUT
+    )
 
     # find most recent backup id and restore
     leader_unit = await backup_helpers.get_leader_unit(