From 1eaae433ab98a6935109ece4638e7653a2e84bf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nehalenni=C3=A6=20Oudin?= <17551419+Gu1nness@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:01:53 +0200 Subject: [PATCH] [DPE-4663]: Fix flaky sharding + tls tests (#433) ## Issue * The Sharding + TLS test is flaky on `test_tls_inconsistent_rels`. * Sometimes it stalls and a unit stays in state `[active] executing` after we remove TLS * Sometimes, `juju status` returns an error because a volume is defined but not filesystem associated with it can be found. ## Solution * First problem : the `destroy_cluster` method was using `ops_test.fast_forward` in an unsafe way which globally lowered the `update-status-hook-interval` to 10s which is way too low and would stall a unit. * Second problem : Update `destroy_cluster` to destroy the storage safely when destroying the cluster so we are sure that the volumes are cleaned before doing anymore tests. * On top of that, two minor fixes, one making a call to `juju status` fail on errors, and an update in the charm to prevent `AttributeError`s when accessing `self.peers.units` in an unsafe way. --------- Co-authored-by: Mia Altieri <32723809+MiaAltieri@users.noreply.github.com> --- charm_internal_version | 2 +- src/charm.py | 16 +++++++++++----- tests/integration/helpers.py | 16 +++++++++------- tests/integration/sharding_tests/helpers.py | 11 +++++++---- .../sharding_tests/test_sharding_tls.py | 6 +++--- 5 files changed, 31 insertions(+), 20 deletions(-) diff --git a/charm_internal_version b/charm_internal_version index d00491fd7..0cfbf0888 100644 --- a/charm_internal_version +++ b/charm_internal_version @@ -1 +1 @@ -1 +2 diff --git a/src/charm.py b/src/charm.py index b5cd9c279..71c1c8a66 100755 --- a/src/charm.py +++ b/src/charm.py @@ -208,7 +208,7 @@ def primary(self) -> str: return self.unit.name # check if peer unit matches primary ip - for unit in self.peers.units: + for unit in self.peers_units: if primary_ip == self.unit_ip(unit): return unit.name @@ -230,9 +230,7 @@ def unit_ips(self) -> List[str]: Returns: a list of IP address associated with MongoDB application. """ - peer_addresses = [] - if self.peers: - peer_addresses = [self.unit_ip(unit) for unit in self.peers.units] + peer_addresses = [self.unit_ip(unit) for unit in self.peers_units] logger.debug("peer addresses: %s", peer_addresses) self_address = self.unit_ip(self.unit) @@ -314,6 +312,14 @@ def peers(self) -> Optional[Relation]: """ return self.model.get_relation(Config.Relations.PEERS) + @property + def peers_units(self) -> list[Unit]: + """Get peers units in a safe way.""" + if not self.peers: + return [] + else: + return self.peers.units + @property def db_initialised(self) -> bool: """Check if MongoDB is initialised.""" @@ -1509,7 +1515,7 @@ def _generate_relation_departed_key(rel_id: int) -> str: @property def _is_removing_last_replica(self) -> bool: """Returns True if the last replica (juju unit) is getting removed.""" - return self.app.planned_units() == 0 and len(self.peers.units) == 0 + return self.app.planned_units() == 0 and len(self.peers_units) == 0 def get_invalid_integration_status(self) -> Optional[StatusBase]: """Returns a status if an invalid integration is present.""" diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 9dd7895bb..7ecb2c50c 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -362,13 +362,15 @@ async def get_unit_hostname(ops_test: OpsTest, unit_id: int, app: str) -> str: return hostname.strip() -def get_raw_application(ops_test: OpsTest, app: str) -> Dict[str, Any]: +async def get_raw_application(ops_test: OpsTest, app: str) -> Dict[str, Any]: """Get raw application details.""" - return json.loads( - subprocess.check_output( - f"juju status --model {ops_test.model.info.name} {app} --format=json".split() - ) - )["applications"][app] + ret_code, stdout, stderr = await ops_test.juju( + *f"status --model {ops_test.model.info.name} {app} --format=json".split() + ) + if ret_code != 0: + logger.error(f"Invalid return [{ret_code=}]: {stderr=}") + raise Exception(f"[{ret_code=}] {stderr=}") + return json.loads(stdout)["applications"][app] async def get_application_units(ops_test: OpsTest, app: str) -> List[Unit]: @@ -376,7 +378,7 @@ async def get_application_units(ops_test: OpsTest, app: str) -> List[Unit]: # Juju incorrectly reports the IP addresses after the network is restored this is reported as a # bug here: https://github.com/juju/python-libjuju/issues/738. Once this bug is resolved use of # `get_unit_ip` should be replaced with `.public_address` - raw_app = get_raw_application(ops_test, app) + raw_app = await get_raw_application(ops_test, app) units = [] for u_name, unit in raw_app["units"].items(): unit_id = int(u_name.split("/")[-1]) diff --git a/tests/integration/sharding_tests/helpers.py b/tests/integration/sharding_tests/helpers.py index 24b640ebb..742c3cd29 100644 --- a/tests/integration/sharding_tests/helpers.py +++ b/tests/integration/sharding_tests/helpers.py @@ -157,7 +157,9 @@ async def deploy_cluster_components( async def destroy_cluster(ops_test): """Destroy cluster in a forceful way.""" for app in CLUSTER_COMPONENTS: - await ops_test.model.applications[app].destroy(force=True, no_wait=False) + await ops_test.model.applications[app].destroy( + destroy_storage=True, force=True, no_wait=False + ) # destroy does not wait for applications to be removed, perform this check manually for attempt in Retrying(stop=stop_after_attempt(100), wait=wait_fixed(10), reraise=True): @@ -165,9 +167,10 @@ async def destroy_cluster(ops_test): # pytest_operator has a bug where the number of applications does not get correctly # updated. Wrapping the call with `fast_forward` resolves this async with ops_test.fast_forward(): - assert ( - len(ops_test.model.applications) == 1 - ), "old cluster not destroyed successfully." + n_applications = len(ops_test.model.applications) + # This case we don't raise an error in the context manager which + # fails to restore the `update-status-hook-interval` value to it's former state. + assert n_applications == 1, "old cluster not destroyed successfully." async def integrate_cluster(ops_test: OpsTest) -> None: diff --git a/tests/integration/sharding_tests/test_sharding_tls.py b/tests/integration/sharding_tests/test_sharding_tls.py index 23c1dd8f6..09a956b3c 100644 --- a/tests/integration/sharding_tests/test_sharding_tls.py +++ b/tests/integration/sharding_tests/test_sharding_tls.py @@ -126,7 +126,7 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None: ) await wait_for_mongodb_units_blocked( - ops_test, SHARD_ONE_APP_NAME, status="Shard requires TLS to be enabled.", timeout=300 + ops_test, SHARD_ONE_APP_NAME, status="Shard requires TLS to be enabled.", timeout=450 ) # Re-integrate to bring cluster back to steady state @@ -159,7 +159,7 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None: ops_test, SHARD_ONE_APP_NAME, status="Shard has TLS enabled, but config-server does not.", - timeout=300, + timeout=450, ) # CASE 3: Cluster components are using different CA's @@ -180,7 +180,7 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None: ops_test, SHARD_ONE_APP_NAME, status="Shard CA and Config-Server CA don't match.", - timeout=300, + timeout=450, )