[DPE-4663]: Fix flaky sharding + tls tests (#433)
## Issue
 
* The Sharding + TLS test is flaky on `test_tls_inconsistent_rels`.
* Sometimes it stalls: a unit stays in the `[active] executing` state after we remove TLS.
* Sometimes `juju status` returns an error because a volume is defined but no filesystem associated with it can be found.

## Solution
* First problem: the `destroy_cluster` helper used `ops_test.fast_forward` in an unsafe way, globally lowering the `update-status-hook-interval` to 10s. That interval is far too low and would stall a unit (see the sketch below).
* Second problem: update `destroy_cluster` to destroy storage along with the cluster, so that the volumes are guaranteed to be cleaned up before any further tests run.
* On top of that, two minor fixes: make the `juju status` helper fail on errors, and update the charm to prevent `AttributeError`s when `self.peers.units` is accessed before the peer relation exists.
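For context, a minimal sketch of the unsafe pattern versus the fix. It assumes pytest-operator's `fast_forward` behaviour of temporarily lowering `update-status-hook-interval` and restoring it on a clean exit; whether the old value survives an exception raised in the body is exactly the failure mode suspected here, not documented behaviour:

```python
import pytest
from pytest_operator.plugin import OpsTest


@pytest.mark.asyncio
async def test_unsafe_pattern(ops_test: OpsTest) -> None:
    # UNSAFE: fast_forward lowers update-status-hook-interval for the
    # whole model (10s by default). If the assertion raises, the old
    # interval may never be restored, and units can end up permanently
    # busy in `[active] executing`.
    async with ops_test.fast_forward():
        assert len(ops_test.model.applications) == 1


@pytest.mark.asyncio
async def test_safe_pattern(ops_test: OpsTest) -> None:
    # SAFE: reading model state needs no fast_forward at all, so a
    # failing assertion cannot leak a lowered hook interval.
    n_applications = len(ops_test.model.applications)
    assert n_applications == 1, "old cluster not destroyed successfully."
```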

---------

Co-authored-by: Mia Altieri <[email protected]>
Gu1nness and MiaAltieri authored Jul 22, 2024
1 parent b33d036 commit 1eaae43
Showing 5 changed files with 31 additions and 20 deletions.
2 changes: 1 addition & 1 deletion charm_internal_version
@@ -1 +1 @@
-1
+2
16 changes: 11 additions & 5 deletions src/charm.py
@@ -208,7 +208,7 @@ def primary(self) -> str:
return self.unit.name

# check if peer unit matches primary ip
-for unit in self.peers.units:
+for unit in self.peers_units:
if primary_ip == self.unit_ip(unit):
return unit.name

@@ -230,9 +230,7 @@ def unit_ips(self) -> List[str]:
Returns:
a list of IP address associated with MongoDB application.
"""
-peer_addresses = []
-if self.peers:
-    peer_addresses = [self.unit_ip(unit) for unit in self.peers.units]
+peer_addresses = [self.unit_ip(unit) for unit in self.peers_units]

logger.debug("peer addresses: %s", peer_addresses)
self_address = self.unit_ip(self.unit)
@@ -314,6 +312,14 @@ def peers(self) -> Optional[Relation]:
"""
return self.model.get_relation(Config.Relations.PEERS)

+@property
+def peers_units(self) -> list[Unit]:
+    """Get peers units in a safe way."""
+    if not self.peers:
+        return []
+    else:
+        return self.peers.units

@property
def db_initialised(self) -> bool:
"""Check if MongoDB is initialised."""
@@ -1509,7 +1515,7 @@ def _generate_relation_departed_key(rel_id: int) -> str:
@property
def _is_removing_last_replica(self) -> bool:
"""Returns True if the last replica (juju unit) is getting removed."""
-return self.app.planned_units() == 0 and len(self.peers.units) == 0
+return self.app.planned_units() == 0 and len(self.peers_units) == 0

def get_invalid_integration_status(self) -> Optional[StatusBase]:
"""Returns a status if an invalid integration is present."""
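The new `peers_units` property guards against the fact that `model.get_relation` returns `None` until the peer relation has been created, so accessing `self.peers.units` can raise. A standalone sketch of that failure mode (illustrative stand-ins, not the charm's actual code):

```python
from typing import Optional, Set


class Relation:
    """Stand-in for ops' Relation; real ones expose a set of units."""

    units: Set[str] = set()


def get_relation(name: str) -> Optional[Relation]:
    # Before the peer relation exists, the model returns None.
    return None


peers = get_relation("database-peers")
try:
    units = peers.units  # AttributeError: 'NoneType' object has no attribute 'units'
except AttributeError:
    units = []  # peers_units returns [] in this case, so callers never crash
print(units)
```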
16 changes: 9 additions & 7 deletions tests/integration/helpers.py
@@ -362,21 +362,23 @@ async def get_unit_hostname(ops_test: OpsTest, unit_id: int, app: str) -> str:
return hostname.strip()


-def get_raw_application(ops_test: OpsTest, app: str) -> Dict[str, Any]:
+async def get_raw_application(ops_test: OpsTest, app: str) -> Dict[str, Any]:
"""Get raw application details."""
-    return json.loads(
-        subprocess.check_output(
-            f"juju status --model {ops_test.model.info.name} {app} --format=json".split()
-        )
-    )["applications"][app]
+    ret_code, stdout, stderr = await ops_test.juju(
+        *f"status --model {ops_test.model.info.name} {app} --format=json".split()
+    )
+    if ret_code != 0:
+        logger.error(f"Invalid return [{ret_code=}]: {stderr=}")
+        raise Exception(f"[{ret_code=}] {stderr=}")
+    return json.loads(stdout)["applications"][app]


async def get_application_units(ops_test: OpsTest, app: str) -> List[Unit]:
"""Get fully detailed units of an application."""
# Juju incorrectly reports the IP addresses after the network is restored this is reported as a
# bug here: https://github.com/juju/python-libjuju/issues/738. Once this bug is resolved use of
# `get_unit_ip` should be replaced with `.public_address`
-raw_app = get_raw_application(ops_test, app)
+raw_app = await get_raw_application(ops_test, app)
units = []
for u_name, unit in raw_app["units"].items():
unit_id = int(u_name.split("/")[-1])
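Because `get_raw_application` is now a coroutine that raises on a non-zero `juju status` exit code, every caller must `await` it. A hedged usage sketch (the application name is assumed for illustration):

```python
async def check_unit_addresses(ops_test: OpsTest) -> None:
    # Raises instead of returning partial JSON when `juju status` fails,
    # e.g. while a volume exists without an associated filesystem.
    raw_app = await get_raw_application(ops_test, "config-server")
    for unit_name, unit in raw_app["units"].items():
        print(unit_name, unit.get("public-address"))
```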
11 changes: 7 additions & 4 deletions tests/integration/sharding_tests/helpers.py
@@ -157,17 +157,20 @@ async def deploy_cluster_components(
async def destroy_cluster(ops_test):
"""Destroy cluster in a forceful way."""
for app in CLUSTER_COMPONENTS:
-    await ops_test.model.applications[app].destroy(force=True, no_wait=False)
+    await ops_test.model.applications[app].destroy(
+        destroy_storage=True, force=True, no_wait=False
+    )

# destroy does not wait for applications to be removed, perform this check manually
for attempt in Retrying(stop=stop_after_attempt(100), wait=wait_fixed(10), reraise=True):
with attempt:
# pytest_operator has a bug where the number of applications does not get correctly
# updated. Wrapping the call with `fast_forward` resolves this
-    async with ops_test.fast_forward():
-        assert (
-            len(ops_test.model.applications) == 1
-        ), "old cluster not destroyed successfully."
+    n_applications = len(ops_test.model.applications)
+    # This way we don't raise inside `fast_forward`, which would fail to
+    # restore the `update-status-hook-interval` value to its former state.
+    assert n_applications == 1, "old cluster not destroyed successfully."


async def integrate_cluster(ops_test: OpsTest) -> None:
Expand Down
6 changes: 3 additions & 3 deletions tests/integration/sharding_tests/test_sharding_tls.py
@@ -126,7 +126,7 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None:
)

await wait_for_mongodb_units_blocked(
-    ops_test, SHARD_ONE_APP_NAME, status="Shard requires TLS to be enabled.", timeout=300
+    ops_test, SHARD_ONE_APP_NAME, status="Shard requires TLS to be enabled.", timeout=450
)

# Re-integrate to bring cluster back to steady state
@@ -159,7 +159,7 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None:
ops_test,
SHARD_ONE_APP_NAME,
status="Shard has TLS enabled, but config-server does not.",
-    timeout=300,
+    timeout=450,
)

# CASE 3: Cluster components are using different CA's
@@ -180,7 +180,7 @@ async def test_tls_inconsistent_rels(ops_test: OpsTest) -> None:
ops_test,
SHARD_ONE_APP_NAME,
status="Shard CA and Config-Server CA don't match.",
-    timeout=300,
+    timeout=450,
)


