Commit 406e341

[DPE-4755] [DPE-4753] Implement upgrade procedure and basic integration tests for upgrading a sharded cluster (#428)

## Issue
Missing tests that verify:
1. success of pre-upgrade check
2. failure of pre-upgrade check
3. success of upgrade operations

## Solution
Add missing tests

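As a rough sketch (this is not the code added in this PR), the pre-upgrade check tests take roughly the following shape; the application name and assertion text are illustrative, while `pre-upgrade-check` is the action name the charm exposes:

```python
# Hypothetical outline of a pre-upgrade check test; the app name and details
# are placeholders, not the exact code merged here.
import pytest
from pytest_operator.plugin import OpsTest

CONFIG_SERVER_APP_NAME = "config-server"


@pytest.mark.abort_on_fail
async def test_pre_upgrade_check_success(ops_test: OpsTest) -> None:
    """On a healthy cluster, the pre-upgrade check action completes cleanly."""
    unit = ops_test.model.applications[CONFIG_SERVER_APP_NAME].units[0]
    action = await unit.run_action("pre-upgrade-check")
    await action.wait()
    assert action.status == "completed", "pre-upgrade check failed on a healthy cluster"
```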
## Notes
The upgrade test is disabled until the upgrade work is published on Charmhub, but the test suite was run manually by building the charm with an earlier snap and upgrading the charm to a newer snap revision.

## Future Work
Once the upgrade work is published on Charmhub:
1. Deploy the charm from Charmhub in the test deployment step
2. Re-enable the upgrade test

---------

Co-authored-by: Ubuntu <[email protected]>
MiaAltieri and Ubuntu authored Jul 9, 2024
1 parent ae2afe9 commit 406e341
Showing 11 changed files with 538 additions and 110 deletions.
15 changes: 6 additions & 9 deletions lib/charms/mongodb/v0/set_status.py
@@ -18,7 +18,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
-LIBPATCH = 1
+LIBPATCH = 2


class MongoDBStatusHandler(Object):
@@ -105,20 +105,17 @@ def share_status_to_config_server(self):

    def is_unit_status_ready_for_upgrade(self) -> bool:
        """Returns True if the status of the current unit reflects that it is ready for upgrade."""
-        current_status = type(self.charm.unit.status)
-        status_message = self.charm.unit.status.message
+        current_status = self.charm.unit.status
+        status_message = current_status.message
        if isinstance(current_status, ActiveStatus):
            return True

        if not isinstance(current_status, WaitingStatus):
            return False

-        if (
-            status_message
-            and status_message != Config.Status.CONFIG_SERVER_WAITING_FOR_REFRESH.message
-        ):
-            return False
+        if status_message and "is not up-to date with config-server" in status_message:
+            return True

-        return True
+        return False

# END: Helpers
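
For illustration only (not part of this diff), the new readiness rules can be exercised with duck-typed stand-ins, assuming the charm's lib directory is on PYTHONPATH so the library imports:

```python
# Illustrative check of the new readiness logic; SimpleNamespace stands in for
# the handler and its charm because the method only reads the unit status.
from types import SimpleNamespace

from ops.model import ActiveStatus, MaintenanceStatus, WaitingStatus

from charms.mongodb.v0.set_status import MongoDBStatusHandler


def ready(status) -> bool:
    fake_self = SimpleNamespace(charm=SimpleNamespace(unit=SimpleNamespace(status=status)))
    return MongoDBStatusHandler.is_unit_status_ready_for_upgrade(fake_self)


assert ready(ActiveStatus())
assert not ready(MaintenanceStatus("installing MongoDB"))
assert not ready(WaitingStatus("waiting to start MongoDB"))
assert ready(WaitingStatus("shard-one is not up-to date with config-server"))
```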
4 changes: 0 additions & 4 deletions src/config.py
@@ -5,8 +5,6 @@

from typing import Literal, TypeAlias

-from ops.model import WaitingStatus

Package: TypeAlias = tuple[str, str, str]


@@ -116,10 +114,8 @@ class Status:
"""

STATUS_READY_FOR_UPGRADE = "status-shows-ready-for-upgrade"
-        CONFIG_SERVER_WAITING_FOR_REFRESH = WaitingStatus("Waiting for refresh command.")

class Upgrade:
"""Upgrade related constants."""

-        WAITING_FOR_REFRESH_KEY = "waiting_for_refresh"
FEATURE_VERSION_6 = "6.0"
14 changes: 0 additions & 14 deletions src/upgrades/mongodb_upgrade.py
@@ -123,13 +123,6 @@ def _on_upgrade_charm(self, _):
if not self._upgrade.in_progress:
logger.info("Charm upgraded. MongoDB version unchanged")

-        if self.charm.is_role(Config.Role.CONFIG_SERVER):
-            # The actions performed as a pre-upgrade check negatively impact cluster
-            # performance i.e. disabling the balancer. Users should NOT run the
-            # pre-upgrade-check action and never run the juju refresh command. Once the users
-            # have ran the refresh we no longer have to worry about having to wait
-            del self.charm.app_peer_data[Config.Upgrade.WAITING_FOR_REFRESH_KEY]
-
self._upgrade.upgrade_resumed = False
# Only call `_reconcile_upgrade` on leader unit to avoid race conditions with
# `upgrade_resumed`
@@ -626,13 +619,6 @@ def turn_off_and_wait_for_balancer(self):
if balancer_state["mode"] != "off":
raise BalancerStillRunningError("balancer is still Running.")

-    def is_config_server_waiting_for_refresh(self) -> bool:
-        """Returns true if pre-upgrade check ran and config-server is waiting to be refreshed."""
-        if not self.charm.is_role(Config.Role.CONFIG_SERVER):
-            return False
-
-        return Config.Upgrade.WAITING_FOR_REFRESH_KEY in self.charm.app_peer_data
-
# END: helpers

# BEGIN: properties
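
For context, the pre-upgrade check turns the balancer off before a refresh; a minimal sketch of the stop-and-verify step that `turn_off_and_wait_for_balancer` performs (placeholder URI, retries omitted):

```python
# Minimal sketch: stop the mongos balancer, then verify it reports "off".
# The connection URI is a placeholder; production code retries the check.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27018/admin")  # placeholder mongos URI
client.admin.command("balancerStop")
balancer_state = client.admin.command("balancerStatus")
assert balancer_state["mode"] == "off", "balancer is still running"
client.close()
```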
39 changes: 39 additions & 0 deletions tests/integration/conftest.py
@@ -6,8 +6,11 @@
import os
from pathlib import Path

+import pytest
from pytest_operator.plugin import OpsTest

+from .sharding_tests import writes_helpers as writes_helpers


def ops_test(ops_test: OpsTest) -> OpsTest:
if os.environ.get("CI") == "true":
@@ -24,3 +27,39 @@ async def build_charm(charm_path, bases_index: int = None) -> Path:

ops_test.build_charm = build_charm
return ops_test


+@pytest.fixture()
+async def continuous_writes_to_shard_one(ops_test: OpsTest):
+    """Adds writes to a shard named shard-one before test starts and clears writes at the end."""
+    await writes_helpers.start_continous_writes_on_shard(
+        ops_test,
+        shard_name=writes_helpers.SHARD_ONE_APP_NAME,
+        db_name=writes_helpers.SHARD_ONE_DB_NAME,
+    )
+
+    yield
+    await writes_helpers.stop_continous_writes(
+        ops_test,
+        config_server_name=writes_helpers.CONFIG_SERVER_APP_NAME,
+        db_name=writes_helpers.SHARD_ONE_DB_NAME,
+    )
+    await writes_helpers.remove_db_writes(ops_test, db_name=writes_helpers.SHARD_ONE_DB_NAME)


+@pytest.fixture()
+async def continuous_writes_to_shard_two(ops_test: OpsTest):
+    """Adds writes to a shard named shard-two before test starts and clears writes at the end."""
+    await writes_helpers.start_continous_writes_on_shard(
+        ops_test,
+        shard_name=writes_helpers.SHARD_TWO_APP_NAME,
+        db_name=writes_helpers.SHARD_TWO_DB_NAME,
+    )
+
+    yield
+    await writes_helpers.stop_continous_writes(
+        ops_test,
+        config_server_name=writes_helpers.CONFIG_SERVER_APP_NAME,
+        db_name=writes_helpers.SHARD_TWO_DB_NAME,
+    )
+    await writes_helpers.remove_db_writes(ops_test, db_name=writes_helpers.SHARD_TWO_DB_NAME)
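
A hypothetical consumer of these fixtures (not part of this commit) would look like:

```python
# Hypothetical test using the new fixture: writes to shard-one start before
# the test body runs and are stopped and cleaned up afterwards.
import pytest
from pytest_operator.plugin import OpsTest


@pytest.mark.abort_on_fail
async def test_writes_continue_during_refresh(
    ops_test: OpsTest, continuous_writes_to_shard_one
) -> None:
    # refresh/upgrade steps would go here while the writes keep flowing
    ...
```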
30 changes: 18 additions & 12 deletions tests/integration/ha_tests/continuous_writes.py
@@ -5,20 +5,28 @@
import sys

from pymongo import MongoClient
-from pymongo.errors import AutoReconnect, NotPrimaryError, PyMongoError
+from pymongo.errors import PyMongoError
from pymongo.write_concern import WriteConcern

+DEFAULT_DB_NAME = "new-db"
+DEFAULT_COLL_NAME = "test_collection"


-def continous_writes(connection_string: str, starting_number: int):
+def continous_writes(
+    connection_string: str,
+    starting_number: int,
+    db_name: str,
+    coll_name: str,
+):
    write_value = starting_number

    while True:
        client = MongoClient(
            connection_string,
            socketTimeoutMS=5000,
        )
-        db = client["new-db"]
-        test_collection = db["test_collection"]
+        db = client[db_name]
+        test_collection = db[coll_name]
        try:
            # insert item into collection if it doesn't already exist
            test_collection.with_options(
@@ -30,15 +38,11 @@ def continous_writes(connection_string: str, starting_number: int):
).update_one({"number": write_value}, {"$set": {"number": write_value}}, upsert=True)

# update_one
-        except (NotPrimaryError, AutoReconnect):
-            # this means that the primary was not able to be found. An application should try to
-            # reconnect and re-write the previous value. Hence, we `continue` here, without
+        except PyMongoError:
+            # PyMongoErrors should result in an attempt to retry a write. An application should
+            # try to reconnect and re-write the previous value. Hence, we `continue` here, without
            # incrementing `write_value` as to try to insert this value again.
            continue
-        except PyMongoError:
-            # we should not raise this exception but instead increment the write value and move
-            # on, indicating that there was a failure writing to the database.
-            pass
finally:
client.close()

@@ -48,7 +52,9 @@ def continous_writes(connection_string: str, starting_number: int):
def main():
connection_string = sys.argv[1]
starting_number = int(sys.argv[2])
-    continous_writes(connection_string, starting_number)
+    db_name = DEFAULT_DB_NAME if len(sys.argv) < 4 else sys.argv[3]
+    coll_name = DEFAULT_COLL_NAME if len(sys.argv) < 5 else sys.argv[4]
+    continous_writes(connection_string, starting_number, db_name, coll_name)


if __name__ == "__main__":
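
With the new optional arguments, the script can target a specific database and collection; a hypothetical launch (placeholder URI and names) looks like:

```python
# Hypothetical launcher for continuous_writes.py showing the extended CLI:
# argv[3] is an optional db name, argv[4] an optional collection name.
import subprocess
import sys

subprocess.Popen(
    [
        sys.executable,
        "tests/integration/ha_tests/continuous_writes.py",
        "mongodb://operator:password@localhost:27017/admin",  # placeholder URI
        "0",  # starting write number
        "shard_one_db",  # optional db name, defaults to "new-db"
        "test_collection",  # optional collection name
    ]
)
```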
75 changes: 74 additions & 1 deletion tests/integration/sharding_tests/helpers.py
@@ -1,20 +1,28 @@
#!/usr/bin/env python3
# Copyright 2023 Canonical Ltd.
# See LICENSE file for licensing details.
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
from urllib.parse import quote_plus

from pymongo import MongoClient
from pytest_operator.plugin import OpsTest
from tenacity import Retrying, stop_after_attempt, wait_fixed

from ..helpers import get_password
from ..relation_tests.new_relations.helpers import (
get_application_relation_data,
get_secret_data,
)

TIMEOUT = 10 * 60
MONGOS_PORT = 27018
MONGOD_PORT = 27017
SHARD_ONE_APP_NAME = "shard-one"
SHARD_TWO_APP_NAME = "shard-two"
CONFIG_SERVER_APP_NAME = "config-server"
CLUSTER_COMPONENTS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME, CONFIG_SERVER_APP_NAME]
CONFIG_SERVER_REL_NAME = "config-server"
SHARD_REL_NAME = "sharding"


async def generate_mongodb_client(
@@ -107,3 +115,68 @@ def count_users(mongos_client: MongoClient) -> int:
admin_db = mongos_client["admin"]
users_collection = admin_db.system.users
return users_collection.count_documents({})


+async def deploy_cluster_components(
+    ops_test: OpsTest, num_units_cluster_config: Dict = None
+) -> None:
+    if not num_units_cluster_config:
+        num_units_cluster_config = {
+            CONFIG_SERVER_APP_NAME: 2,
+            SHARD_ONE_APP_NAME: 3,
+            SHARD_TWO_APP_NAME: 1,
+        }
+
+    my_charm = await ops_test.build_charm(".")
+    await ops_test.model.deploy(
+        my_charm,
+        num_units=num_units_cluster_config[CONFIG_SERVER_APP_NAME],
+        config={"role": "config-server"},
+        application_name=CONFIG_SERVER_APP_NAME,
+    )
+    await ops_test.model.deploy(
+        my_charm,
+        num_units=num_units_cluster_config[SHARD_ONE_APP_NAME],
+        config={"role": "shard"},
+        application_name=SHARD_ONE_APP_NAME,
+    )
+    await ops_test.model.deploy(
+        my_charm,
+        num_units=num_units_cluster_config[SHARD_TWO_APP_NAME],
+        config={"role": "shard"},
+        application_name=SHARD_TWO_APP_NAME,
+    )
+
+    await ops_test.model.wait_for_idle(
+        apps=CLUSTER_COMPONENTS,
+        idle_period=20,
+        timeout=TIMEOUT,
+    )
+
+
+async def destroy_cluster(ops_test):
+    """Destroy cluster in a forceful way."""
+    for app in CLUSTER_COMPONENTS:
+        await ops_test.model.applications[app].destroy(force=True, no_wait=False)
+
+    # destroy does not wait for applications to be removed, perform this check manually
+    for attempt in Retrying(stop=stop_after_attempt(100), wait=wait_fixed(10), reraise=True):
+        with attempt:
+            # pytest_operator has a bug where the number of applications does not get correctly
+            # updated. Wrapping the call with `fast_forward` resolves this
+            async with ops_test.fast_forward():
+                assert (
+                    len(ops_test.model.applications) == 1
+                ), "old cluster not destroyed successfully."
+
+
+async def integrate_cluster(ops_test: OpsTest) -> None:
+    """Integrates the cluster components with each other."""
+    await ops_test.model.integrate(
+        f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}",
+        f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}",
+    )
+    await ops_test.model.integrate(
+        f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}",
+        f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}",
+    )
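
A hypothetical deployment step built on these helpers (not part of this diff) could read:

```python
# Hypothetical test module using the new helpers; the relative import assumes
# it lives next to helpers.py under tests/integration/sharding_tests/.
import pytest
from pytest_operator.plugin import OpsTest

from .helpers import (
    CLUSTER_COMPONENTS,
    TIMEOUT,
    deploy_cluster_components,
    integrate_cluster,
)


@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
    await deploy_cluster_components(ops_test)
    await integrate_cluster(ops_test)
    await ops_test.model.wait_for_idle(
        apps=CLUSTER_COMPONENTS, status="active", timeout=TIMEOUT
    )
```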
13 changes: 11 additions & 2 deletions tests/integration/sharding_tests/test_sharding_backups.py
@@ -221,7 +221,10 @@ async def test_restore_backup(ops_test: OpsTest, add_writes_to_shards) -> None:
"""Tests that sharded Charmed MongoDB cluster supports restores."""
# count total writes
cluster_writes = await writes_helpers.get_cluster_writes_count(
-        ops_test, shard_app_names=SHARD_APPS, db_names=[SHARD_ONE_DB_NAME, SHARD_TWO_DB_NAME]
+        ops_test,
+        shard_app_names=SHARD_APPS,
+        db_names=[SHARD_ONE_DB_NAME, SHARD_TWO_DB_NAME],
+        config_server_name=CONFIG_SERVER_APP_NAME,
)

assert cluster_writes["total_writes"], "no writes to backup"
@@ -288,6 +291,7 @@ async def test_migrate_restore_backup(ops_test: OpsTest, add_writes_to_shards) -
ops_test,
shard_app_names=SHARD_APPS,
db_names=[SHARD_ONE_DB_NAME, SHARD_TWO_DB_NAME],
+        config_server_name=CONFIG_SERVER_APP_NAME,
)
assert cluster_writes["total_writes"], "no writes to backup"
assert cluster_writes[SHARD_ONE_APP_NAME], "no writes to backup for shard one"
@@ -471,7 +475,10 @@ async def add_and_verify_unwanted_writes(ops_test, old_cluster_writes: Dict) ->
content={"horse-breed": "pegasus", "real": True},
)
new_total_writes = await writes_helpers.get_cluster_writes_count(
-        ops_test, shard_app_names=SHARD_APPS, db_names=[SHARD_ONE_DB_NAME, SHARD_TWO_DB_NAME]
+        ops_test,
+        shard_app_names=SHARD_APPS,
+        db_names=[SHARD_ONE_DB_NAME, SHARD_TWO_DB_NAME],
+        config_server_name=CONFIG_SERVER_APP_NAME,
)

assert (
@@ -489,6 +496,7 @@ async def verify_writes_restored(
ops_test, exppected_cluster_writes: Dict, new_names=False
) -> None:
"""Verify that writes were correctly restored."""
+    config_server_name = CONFIG_SERVER_APP_NAME if not new_names else CONFIG_SERVER_APP_NAME_NEW
shard_one_name = SHARD_ONE_APP_NAME if not new_names else SHARD_ONE_APP_NAME_NEW
shard_two_name = SHARD_TWO_APP_NAME if not new_names else SHARD_TWO_APP_NAME_NEW
shard_apps = [shard_one_name, shard_two_name]
@@ -500,6 +508,7 @@
ops_test,
shard_app_names=shard_apps,
db_names=[SHARD_ONE_DB_NAME, SHARD_TWO_DB_NAME],
+        config_server_name=config_server_name,
)
assert (
restored_total_writes["total_writes"] == exppected_cluster_writes["total_writes"]
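
For reference, a hypothetical call into the updated count helper (constants as defined at the top of this test module):

```python
# Hypothetical usage of get_cluster_writes_count with the new
# config_server_name argument; the result maps shard names and "total_writes"
# to write counts, as the assertions in this file show.
cluster_writes = await writes_helpers.get_cluster_writes_count(
    ops_test,
    shard_app_names=SHARD_APPS,
    db_names=[SHARD_ONE_DB_NAME, SHARD_TWO_DB_NAME],
    config_server_name=CONFIG_SERVER_APP_NAME,
)
assert cluster_writes["total_writes"], "no writes recorded"
```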