update to latest libs (#365)
* update to latest libs

* update libs

* fix HA tests

* fix mongos tests

* fix metrics test

* update channel for revision check

* wait for permissions to be resolved

* wait for model to settle before creating backup

* adding missing return

* chore: remove useless comma

* remove block until

* fix: Use context manager to stop looping forever

* revert earlier change

* Revert "remove block until"

This reverts commit 22446d1.

* fix: Delay setting the partition in case the leader is the unit to upgrade

---------

Co-authored-by: Mehdi-Bendriss <[email protected]>
Co-authored-by: Neha Oudin <[email protected]>
3 people authored Nov 14, 2024
1 parent ab4e9e7 commit 3fc7400
Showing 14 changed files with 101 additions and 69 deletions.
2 changes: 1 addition & 1 deletion lib/charms/mongodb/v0/config_server_interface.py
@@ -51,7 +51,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 14
LIBPATCH = 15


class ClusterProvider(Object):
2 changes: 1 addition & 1 deletion lib/charms/mongodb/v1/mongodb_backups.py
@@ -41,7 +41,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 6
LIBPATCH = 5

logger = logging.getLogger(__name__)

2 changes: 1 addition & 1 deletion lib/charms/mongodb/v1/mongodb_provider.py
@@ -37,7 +37,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 15
LIBPATCH = 16

logger = logging.getLogger(__name__)
REL_NAME = "database"
24 changes: 13 additions & 11 deletions lib/charms/mongodb/v1/mongodb_tls.py
@@ -12,7 +12,7 @@
import logging
import re
import socket
from typing import Dict, List, Optional, Tuple
from typing import Optional, Tuple

from charms.tls_certificates_interface.v3.tls_certificates import (
CertificateAvailableEvent,
@@ -42,7 +42,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 5

WAIT_CERT_UPDATE = "wait-cert-updated"

@@ -105,9 +105,6 @@ def request_certificate(
internal: bool,
):
"""Request TLS certificate."""
if not self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION):
return

if param is None:
key = generate_private_key()
else:
@@ -234,7 +231,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
self.charm.cluster.update_ca_secret(new_ca=event.ca)
self.charm.config_server.update_ca_secret(new_ca=event.ca)

if self.waiting_for_both_certs():
if self.is_waiting_for_both_certs():
logger.debug(
"Defer till both internal and external TLS certificates available to avoid second restart."
)
@@ -256,7 +253,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
# clear waiting status if db service is ready
self.charm.status.set_and_share_status(ActiveStatus())

def waiting_for_both_certs(self):
def is_waiting_for_both_certs(self) -> bool:
"""Returns a boolean indicating whether additional certs are needed."""
if not self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL):
logger.debug("Waiting for internal certificate.")
@@ -295,6 +292,10 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None:
return

logger.debug("Generating a new Certificate Signing Request.")
self.request_new_certificates(internal)

def request_new_certificates(self, internal: bool) -> None:
"""Requests the renewel of a new certificate."""
key = self.get_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL).encode("utf-8")
old_csr = self.get_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL).encode("utf-8")
sans = self.get_new_sans()
@@ -313,8 +314,9 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None:
)

self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, new_csr.decode("utf-8"))
self.set_waiting_for_cert_to_update(waiting=True, internal=internal)

def get_new_sans(self) -> Dict:
def get_new_sans(self) -> dict[str, list[str]]:
"""Create a list of DNS names for a MongoDB unit.
Returns:
@@ -341,7 +343,7 @@ def get_new_sans(self) -> Dict:

return sans

def get_current_sans(self, internal: bool) -> List[str] | None:
def get_current_sans(self, internal: bool) -> dict[str, list[str]] | None:
"""Gets the current SANs for the unit cert."""
# if unit has no certificates do not proceed.
if not self.is_tls_enabled(internal=internal):
@@ -411,9 +413,9 @@ def _get_subject_name(self) -> str:

def is_set_waiting_for_cert_to_update(
self,
internal=False,
internal: bool = False,
) -> bool:
"""Returns True we are waiting for a cert to update."""
"""Returns True if we are waiting for a cert to update."""
scope = "int" if internal else "ext"
label_name = f"{scope}-{WAIT_CERT_UPDATE}"

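Note: the hunks above rename waiting_for_both_certs to is_waiting_for_both_certs and split the CSR regeneration into a reusable request_new_certificates(internal) helper. A minimal sketch of how a caller could combine those calls; the renew_if_sans_changed wrapper and its tls argument are hypothetical and not part of this commit.

# Hypothetical helper; assumes the MongoDBTLS methods shown in the hunks above.
def renew_if_sans_changed(tls) -> None:
    """Re-request certificates for any scope whose SANs no longer match the unit."""
    if tls.is_waiting_for_both_certs():
        return  # a renewal is already in flight; do not request twice
    for internal in (True, False):
        current_sans = tls.get_current_sans(internal=internal)
        if current_sans is None:
            continue  # TLS is not enabled for this scope
        if current_sans != tls.get_new_sans():
            tls.request_new_certificates(internal)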
4 changes: 2 additions & 2 deletions lib/charms/mongodb/v1/shards_interface.py
@@ -58,7 +58,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 11
LIBPATCH = 12

KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
@@ -711,7 +711,7 @@ def _on_relation_changed(self, event):

self.update_member_auth(event, (key_file_enabled, tls_enabled))

if tls_enabled and self.charm.tls.waiting_for_both_certs():
if tls_enabled and self.charm.tls.is_waiting_for_both_certs():
logger.info("Waiting for requested certs, before restarting and adding to cluster.")
event.defer()
return
30 changes: 26 additions & 4 deletions lib/charms/tls_certificates_interface/v3/tls_certificates.py
@@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 20
LIBPATCH = 23

PYDEPS = ["cryptography", "jsonschema"]

@@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None:
)
else:
try:
secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
logger.debug(
"Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}"
)
secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
# Juju < 3.6 will create a new revision even if the content is the same
if (
secret.get_content(refresh=True).get("certificate", "")
== certificate.certificate
):
logger.debug(
"Secret %s with correct certificate already exists",
f"{LIBID}-{csr_in_sha256_hex}",
)
continue
secret.set_content(
{"certificate": certificate.certificate, "csr": certificate.csr}
)
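Note: the check added above exists because, on Juju < 3.6, Secret.set_content() creates a new secret revision even when the content is identical. A standalone sketch of the same guard with hypothetical names (charm, label and content are placeholders, not part of this commit):

from ops import CharmBase
from ops.model import SecretNotFoundError

def set_secret_content_if_changed(charm: CharmBase, label: str, content: dict) -> None:
    """Write a Juju secret only when its content actually changed (leader-side)."""
    try:
        secret = charm.model.get_secret(label=label)
    except SecretNotFoundError:
        charm.app.add_secret(content, label=label)
        return
    if secret.get_content(refresh=True) == content:
        return  # identical content: skip the write to avoid a redundant revision on Juju < 3.6
    secret.set_content(content)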
@@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None:
provider_certificate = self._find_certificate_in_relation_data(csr)
if not provider_certificate:
# A secret expired but we did not find matching certificate. Cleaning up
logger.warning(
"Failed to find matching certificate for csr, cleaning up secret %s",
event.secret.label,
)
event.secret.remove_all_revisions()
return

if not provider_certificate.expiry_time:
# A secret expired but matching certificate is invalid. Cleaning up
logger.warning(
"Certificate matching csr is invalid, cleaning up secret %s",
event.secret.label,
)
event.secret.remove_all_revisions()
return

@@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti
return provider_certificate
return None

def _get_csr_from_secret(self, secret: Secret) -> str:
def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]:
"""Extract the CSR from the secret label or content.
This function is a workaround to maintain backwards compatibility
and fix the issue reported in
https://github.com/canonical/tls-certificates-interface/issues/228
"""
if not (csr := secret.get_content().get("csr", "")):
try:
content = secret.get_content(refresh=True)
except SecretNotFoundError:
return None
if not (csr := content.get("csr", None)):
# In versions <14 of the Lib we were storing the CSR in the label of the secret
# The CSR is now stored in the content of the secret, which was a breaking change
# Here we get the CSR if the secret was created by an app using libpatch 14 or lower
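Note: _get_csr_from_secret now returns None when the backing secret is gone, so callers have to tolerate a missing CSR. An illustrative call site (a simplified stand-in, not the library's actual _on_secret_expired implementation):

# Illustrative only; assumes the helpers shown in the hunks above.
def _on_secret_expired_sketch(self, event) -> None:
    csr = self._get_csr_from_secret(event.secret)
    if csr is None:
        logger.warning("Secret %s has no recoverable CSR; skipping renewal", event.secret.label)
        return
    provider_certificate = self._find_certificate_in_relation_data(csr)
    if not provider_certificate or not provider_certificate.expiry_time:
        # Nothing valid to renew: clean up so the expiry event stops firing.
        event.secret.remove_all_revisions()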
10 changes: 7 additions & 3 deletions src/charm.py
@@ -636,15 +636,17 @@ def _configure_container(self, container: Container) -> None:
except FailedToUpdateFilesystem as err:
raise ContainerNotReadyError from err

self._configure_layers(container)

# when a network cuts and the pod restarts - reconnect to the exporter
try:
self._configure_layers(container)
# when a network cuts and the pod restarts - reconnect to the exporter and pbm
self._connect_mongodb_exporter()
self._connect_pbm_agent()
except MissingSecretError as e:
logger.error("Cannot connect mongodb exporter: %r", e)
raise ContainerNotReadyError
except ChangeError as e:
logger.error("Cannot configure container layers %r", e)
raise ContainerNotReadyError

# BEGIN: charm events
def _on_upgrade(self, event: UpgradeCharmEvent) -> None:
@@ -927,6 +929,8 @@ def mongodb_storage_detaching(self, event) -> None:
self.shard.wait_for_draining(mongos_hosts)
logger.info("Shard successfully drained storage.")

return

try:
# retries over a period of 10 minutes in an attempt to resolve race conditions it is
logger.debug("Removing %s from replica set", self.unit_host(self.unit))
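Note: the _configure_container change above moves layer configuration and the exporter/pbm reconnects into a single try block so that a Pebble ChangeError is translated into ContainerNotReadyError instead of failing the hook. A generic sketch of that pattern with hypothetical names (not the charm's actual code):

from ops.model import Container
from ops.pebble import ChangeError, Layer

class ContainerNotReadyError(Exception):
    """The workload container cannot be (re)configured yet; retry on a later event."""

def configure_workload(container: Container, layer: Layer) -> None:
    try:
        container.add_layer("workload", layer, combine=True)
        container.replan()  # may raise ChangeError if Pebble fails to apply the plan
    except ChangeError as err:
        # Let the caller defer and retry instead of erroring the whole hook.
        raise ContainerNotReadyError("Pebble change failed") from err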
45 changes: 24 additions & 21 deletions src/upgrades/kubernetes_upgrades.py
@@ -224,33 +224,36 @@ def reconcile_partition(
# This does not address the situation where another unit > 1 restarts and sets the
# partition during the `stop` event, but that is unlikely to occur in the small time window
# that causes the unit to hang.
if partition_ < self._partition:
self._partition = partition_
logger.debug(
f"Lowered partition to {partition_} {action_event=} {force=} {self.in_progress=}"
)
if action_event:
assert len(units) >= 2
if self._partition > unit_number(units[1]):
if partition_ > unit_number(units[1]):
message = "Highest number unit is unhealthy. Refresh will not resume."
logger.debug(f"Resume upgrade event failed: {message}")
action_event.fail(message)
return
if force:
# If a unit was unhealthy and the upgrade was forced, only the next unit will
# upgrade. As long as 1 or more units are unhealthy, the upgrade will need to be
# forced for each unit.

# Include "Attempting to" because (on Kubernetes) we only control the partition,
# not which units upgrade. Kubernetes may not upgrade a unit even if the partition
# allows it (e.g. if the charm container of a higher unit is not ready). This is
# also applicable `if not force`, but is unlikely to happen since all units are
# healthy `if not force`.
message = f"Attempting to refresh unit {self._partition}."
else:
message = f"Refresh resumed. Unit {self._partition} is refreshing next."
action_event.set_results({"result": message})
logger.debug(f"Resume refresh succeeded: {message}")
if force:
# If a unit was unhealthy and the upgrade was forced, only
# the next unit will upgrade. As long as 1 or more units
# are unhealthy, the upgrade will need to be forced for
# each unit.

# Include "Attempting to" because (on Kubernetes) we only
# control the partition, not which units upgrade.
# Kubernetes may not upgrade a unit even if the partition
# allows it (e.g. if the charm container of a higher unit
# is not ready). This is also applicable `if not force`,
# but is unlikely to happen since all units are healthy `if
# not force`.
message = f"Attempting to refresh unit {self._partition}."
else:
message = f"Refresh resumed. Unit {self._partition} is refreshing next."
action_event.set_results({"result": message})
logger.debug(f"Resume refresh succeeded: {message}")
if partition_ < self._partition:
self._partition = partition_
logger.debug(
f"Lowered partition to {partition_} {action_event=} {force=} {self.in_progress=}"
)


partition = _Partition()
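Note: the reordering above answers the resume-refresh action before lowering the StatefulSet partition, because lowering it can immediately restart the very unit (possibly the leader) that is handling the action. A simplified sketch of that ordering, with a hypothetical partition setter:

def set_statefulset_partition(value: int) -> None:
    """Hypothetical stand-in for patching the StatefulSet rollingUpdate partition."""
    print(f"partition -> {value}")

def reconcile_partition_order(current: int, target: int, action_event=None) -> None:
    if action_event is not None:
        # Report the action result first: once the partition drops, this unit may be
        # the next one Kubernetes restarts, taking the action handler down with it.
        action_event.set_results({"result": f"Attempting to refresh unit {current}."})
    if target < current:
        set_statefulset_partition(target)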
31 changes: 13 additions & 18 deletions tests/integration/backup_tests/test_backups.py
@@ -12,13 +12,7 @@
import pytest_asyncio
import yaml
from pytest_operator.plugin import OpsTest
from tenacity import (
RetryError,
Retrying,
stop_after_attempt,
stop_after_delay,
wait_fixed,
)
from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed

from ..ha_tests import helpers as ha_helpers
from ..helpers import (
@@ -244,7 +238,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
db_unit = await helpers.get_leader_unit(ops_test)

# create first backup once ready
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)

action = await db_unit.run_action(action_name="create-backup")
first_backup = await action.wait()
@@ -262,7 +256,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
}
await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)

await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)

# create a backup as soon as possible. might not be immediately possible since only one backup
# can happen at a time.
@@ -279,7 +273,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
# backup can take a lot of time so this function returns once the command was successfully
# sent to pbm. Therefore before checking, wait for Charmed MongoDB to finish creating the
# backup
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)

# verify that backups were made in the GCP bucket
try:
@@ -298,7 +292,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
"endpoint": "https://s3.amazonaws.com",
}
await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)

# verify that backups were made on the AWS bucket
try:
@@ -448,13 +442,14 @@ async def test_restore_new_cluster(
), "Backups from old cluster are listed as failed"

# find most recent backup id and restore
for attempt in Retrying(stop=stop_after_attempt(120), wait=wait_fixed(1), reraise=True):
action = await leader_unit.run_action(action_name="list-backups")
list_result = await action.wait()
list_result = list_result.results["backups"]
most_recent_backup = list_result.split("\n")[-1]
backup_id = most_recent_backup.split()[0]
assert "-----" not in backup_id, "list of backups are empty."
action = await leader_unit.run_action(action_name="list-backups")
list_result = await action.wait()
list_result = list_result.results["backups"]
most_recent_backup = list_result.split("\n")[-1]
backup_id = most_recent_backup.split()[0]
action = await leader_unit.run_action(action_name="restore", **{"backup-id": backup_id})
restore = await action.wait()
assert restore.results["restore-status"] == "restore started", "restore not successful"

# verify all writes are present
try:
2 changes: 1 addition & 1 deletion tests/integration/backup_tests/test_sharding_backups.py
@@ -107,7 +107,7 @@ async def test_set_credentials_in_cluster(ops_test: OpsTest, github_secrets) ->

# apply new configuration options
await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)
await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active", timeout=TIMEOUT)
await ops_test.model.wait_for_idle(apps=CLUSTER_APPS, status="active", timeout=TIMEOUT)
await setup_cluster_and_s3(ops_test)


4 changes: 3 additions & 1 deletion tests/integration/ha_tests/test_ha.py
@@ -596,7 +596,9 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, chaos_mesh):

# we need to give juju some time to realize that the instance is back online
time.sleep(RESTART_DELAY)

await ops_test.model.wait_for_idle(
apps=[app], status="active", raise_on_blocked=False, timeout=1000
)
await wait_until_unit_in_status(ops_test, primary, active_unit, "SECONDARY")

# verify presence of primary, replica set member configuration, and number of primaries
3 changes: 3 additions & 0 deletions tests/integration/metrics_tests/test_metrics.py
@@ -124,6 +124,9 @@ async def test_endpoints_network_cut(ops_test: OpsTest, chaos_mesh):
time.sleep(60)

# Wait for the network to be restored
await ops_test.model.wait_for_idle(
apps=[app_name], status="active", raise_on_blocked=False, timeout=1000
)
await ha_helpers.wait_until_unit_in_status(ops_test, primary, active_unit, "SECONDARY")

for unit in ops_test.model.applications[app_name].units:
