Skip to content

Commit

Permalink
[DPE-3291] Fix race condition in sharding relations (#338)
Browse files Browse the repository at this point in the history
## Issue
When sharding components (config-server + shards) are deployed and
immediately related/integrated, the model hangs [see chat
here](https://chat.charmhub.io/charmhub/pl/5baegxny5jro5md71gew8am5dy)

## Reason
The model hangs because the relation hooks execute before both/either
leader-elected + config changed, this results in `pass_hook_checks` not
deferring the event, meaning that the event doesn't execute its
necessary code and the model forever hangs waiting for an event to be
re-emitted that will never be re-emitted

## Solution
Update the order for checks in `pass_hook_checks` so that there is
**always** a deferral if the database has not been initialised.

## Extra
Update this scheme throughout the code base 

## Tests
An extra test suite was added for this case
  • Loading branch information
MiaAltieri authored Jan 18, 2024
1 parent f92c12f commit 678f839
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 23 deletions.
12 changes: 6 additions & 6 deletions lib/charms/mongodb/v0/config_server_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 5


class ClusterProvider(Object):
Expand Down Expand Up @@ -64,6 +64,11 @@ def __init__(

def pass_hook_checks(self, event: EventBase) -> bool:
"""Runs the pre-hooks checks for ClusterProvider, returns True if all pass."""
if not self.charm.db_initialised:
logger.info("Deferring %s. db is not initialised.", type(event))
event.defer()
return False

if not self.charm.is_role(Config.Role.CONFIG_SERVER):
logger.info(
"Skipping %s. ShardingProvider is only be executed by config-server", type(event)
Expand All @@ -73,11 +78,6 @@ def pass_hook_checks(self, event: EventBase) -> bool:
if not self.charm.unit.is_leader():
return False

if not self.charm.db_initialised:
logger.info("Deferring %s. db is not initialised.", type(event))
event.defer()
return False

return True

def _on_relation_changed(self, event) -> None:
Expand Down
12 changes: 6 additions & 6 deletions lib/charms/mongodb/v1/mongodb_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 3
LIBPATCH = 4

logger = logging.getLogger(__name__)
REL_NAME = "database"
Expand Down Expand Up @@ -87,6 +87,11 @@ def __init__(self, charm: CharmBase, substrate="k8s", relation_name: str = "data

def pass_hook_checks(self) -> bool:
"""Runs the pre-hooks checks for MongoDBProvider, returns True if all pass."""
# We shouldn't try to create or update users if the database is not
# initialised. We will create users as part of initialisation.
if not self.charm.db_initialised:
return False

if not self.charm.is_relation_feasible(self.relation_name):
logger.info("Skipping code for relations.")
return False
Expand All @@ -100,11 +105,6 @@ def pass_hook_checks(self) -> bool:
if not self.charm.unit.is_leader():
return False

# We shouldn't try to create or update users if the database is not
# initialised. We will create users as part of initialisation.
if not self.charm.db_initialised:
return False

return True

def _on_relation_event(self, event):
Expand Down
22 changes: 11 additions & 11 deletions lib/charms/mongodb/v1/shards_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 5
KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
Expand Down Expand Up @@ -121,6 +121,11 @@ def _on_relation_joined(self, event):

def pass_hook_checks(self, event: EventBase) -> bool:
"""Runs the pre-hooks checks for ShardingProvider, returns True if all pass."""
if not self.charm.db_initialised:
logger.info("Deferring %s. db is not initialised.", type(event))
event.defer()
return False

if not self.charm.is_relation_feasible(self.relation_name):
logger.info("Skipping event %s , relation not feasible.", type(event))
return False
Expand All @@ -134,11 +139,6 @@ def pass_hook_checks(self, event: EventBase) -> bool:
if not self.charm.unit.is_leader():
return False

if not self.charm.db_initialised:
logger.info("Deferring %s. db is not initialised.", type(event))
event.defer()
return False

# adding/removing shards while a backup/restore is in progress can be disastrous
pbm_status = self.charm.backups.get_pbm_status()
if isinstance(pbm_status, MaintenanceStatus):
Expand Down Expand Up @@ -507,6 +507,11 @@ def _on_relation_changed(self, event):

def pass_hook_checks(self, event):
"""Runs the pre-hooks checks for ConfigServerRequirer, returns True if all pass."""
if not self.charm.db_initialised:
logger.info("Deferring %s. db is not initialised.", type(event))
event.defer()
return False

if not self.charm.is_relation_feasible(self.relation_name):
logger.info("Skipping event %s , relation not feasible.", type(event))
return False
Expand All @@ -515,11 +520,6 @@ def pass_hook_checks(self, event):
logger.info("skipping %s is only be executed by shards", type(event))
return False

if not self.charm.db_initialised:
logger.info("Deferring %s. db is not initialised.", type(event))
event.defer()
return False

mongos_hosts = event.relation.data[event.relation.app].get(HOSTS_KEY, None)
if isinstance(event, RelationBrokenEvent) and not mongos_hosts:
logger.info("Config-server relation never set up, no need to process broken event.")
Expand Down
80 changes: 80 additions & 0 deletions tests/integration/sharding_tests/test_sharding_race_conds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
# Copyright 2023 Canonical Ltd.
# See LICENSE file for licensing details.

import pytest
from pytest_operator.plugin import OpsTest

from .helpers import generate_mongodb_client, has_correct_shards

SHARD_ONE_APP_NAME = "shard-one"
SHARD_TWO_APP_NAME = "shard-two"
SHARD_THREE_APP_NAME = "shard-three"
SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME]
CONFIG_SERVER_APP_NAME = "config-server-one"
SHARD_REL_NAME = "sharding"
CONFIG_SERVER_REL_NAME = "config-server"
MONGODB_KEYFILE_PATH = "/var/snap/charmed-mongodb/current/etc/mongod/keyFile"

TIMEOUT = 60 * 10


@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
"""Build and deploy a sharded cluster."""
my_charm = await ops_test.build_charm(".")
await ops_test.model.deploy(
my_charm,
num_units=2,
config={"role": "config-server"},
application_name=CONFIG_SERVER_APP_NAME,
)
await ops_test.model.deploy(
my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_ONE_APP_NAME
)
await ops_test.model.deploy(
my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_TWO_APP_NAME
)
await ops_test.model.deploy(
my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_THREE_APP_NAME
)


@pytest.mark.abort_on_fail
async def test_immediate_relate(ops_test: OpsTest) -> None:
"""Tests the immediate integration of cluster components works without error."""
await ops_test.model.integrate(
f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}",
f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}",
)
await ops_test.model.integrate(
f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}",
f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}",
)
await ops_test.model.integrate(
f"{SHARD_THREE_APP_NAME}:{SHARD_REL_NAME}",
f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}",
)

await ops_test.model.wait_for_idle(
apps=[
CONFIG_SERVER_APP_NAME,
SHARD_ONE_APP_NAME,
SHARD_TWO_APP_NAME,
SHARD_THREE_APP_NAME,
],
idle_period=20,
status="active",
timeout=TIMEOUT,
raise_on_error=False,
)

mongos_client = await generate_mongodb_client(
ops_test, app_name=CONFIG_SERVER_APP_NAME, mongos=True
)

# verify sharded cluster config
assert has_correct_shards(
mongos_client,
expected_shards=[SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, SHARD_THREE_APP_NAME],
), "Config server did not process config properly"
56 changes: 56 additions & 0 deletions tests/unit/test_config_server_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,59 @@ def is_config_mock_call(*args):
self.harness.set_leader(False)
self.harness.charm.cluster.update_config_server_db(mock.Mock())
self.harness.charm.cluster.database_provides.update_relation_data.assert_not_called()

def test_pass_hooks_check_waits_for_start_config_server(self):
"""Ensure that pass_hooks defers until the database is initialized.
Note: in some cases sharding related hooks execute before config and leader elected hooks,
therefore it is important that the `pass_hooks_check` defers an event until the database
has been started
"""

def is_shard_mock_call(*args):
return args == ("shard")

self.harness.charm.is_role = is_shard_mock_call

event = mock.Mock()
event.params = {}

self.harness.set_leader(False)
self.harness.charm.config_server.pass_hook_checks(event)
event.defer.assert_called()

# once the database has been initialised, pass hooks check should no longer defer if the
# unit is not the leader nor is the wrong wrole
event = mock.Mock()
event.params = {}
self.harness.charm.app_peer_data["db_initialised"] = "True"
self.harness.charm.config_server.pass_hook_checks(event)
event.defer.assert_not_called()

def test_pass_hooks_check_waits_for_start_shard(self):
"""Ensure that pass_hooks defers until the database is initialized.
Note: in some cases sharding related hooks execute before config and leader elected hooks,
therefore it is important that the `pass_hooks_check` defers an event until the database
has been started
"""

def is_config_mock_call(*args):
return args == ("config-server")

self.harness.charm.is_role = is_config_mock_call

event = mock.Mock()
event.params = {}

self.harness.set_leader(False)
self.harness.charm.shard.pass_hook_checks(event)
event.defer.assert_called()

# once the database has been initialised, pass hooks check should no longer defer if the
# unit is not the leader nor is the wrong wrole
event = mock.Mock()
event.params = {}
self.harness.charm.app_peer_data["db_initialised"] = "True"
self.harness.charm.shard.pass_hook_checks(event)
event.defer.assert_not_called()
16 changes: 16 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,22 @@ deps =
commands =
pytest -v --tb native --log-cli-level=INFO -s --durations=0 {posargs} {[vars]tests_path}/integration/sharding_tests/test_sharding_relations.py

[testenv:sharding-race-conditions]
description = Run sharding race condition tests
pass_env =
{[testenv]pass_env}
CI
deps =
pytest
juju==3.2.0.1
pytest-mock
pytest-operator
protobuf==3.20 # temporary fix until new libjuju is released
git+https://github.com/canonical/data-platform-workflows@v8\#subdirectory=python/pytest_plugins/pytest_operator_cache
-r {tox_root}/requirements.txt
commands =
pytest -v --tb native --log-cli-level=INFO -s --durations=0 {posargs} {[vars]tests_path}/integration/sharding_tests/test_sharding_race_conds.py


[testenv:integration]
description = Run all integration tests
Expand Down

0 comments on commit 678f839

Please sign in to comment.