diff --git a/lib/charms/mongodb/v0/config_server_interface.py b/lib/charms/mongodb/v0/config_server_interface.py
index 44e485bbb..b005b80eb 100644
--- a/lib/charms/mongodb/v0/config_server_interface.py
+++ b/lib/charms/mongodb/v0/config_server_interface.py
@@ -51,7 +51,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 14
+LIBPATCH = 15
 
 
 class ClusterProvider(Object):
diff --git a/lib/charms/mongodb/v1/mongodb_backups.py b/lib/charms/mongodb/v1/mongodb_backups.py
index 559242f67..b1d9a0dc6 100644
--- a/lib/charms/mongodb/v1/mongodb_backups.py
+++ b/lib/charms/mongodb/v1/mongodb_backups.py
@@ -41,7 +41,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 6
+LIBPATCH = 5
 
 logger = logging.getLogger(__name__)
 
diff --git a/lib/charms/mongodb/v1/mongodb_provider.py b/lib/charms/mongodb/v1/mongodb_provider.py
index ab21e75cf..0e150b256 100644
--- a/lib/charms/mongodb/v1/mongodb_provider.py
+++ b/lib/charms/mongodb/v1/mongodb_provider.py
@@ -37,7 +37,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 15
+LIBPATCH = 16
 
 logger = logging.getLogger(__name__)
 REL_NAME = "database"
diff --git a/lib/charms/mongodb/v1/mongodb_tls.py b/lib/charms/mongodb/v1/mongodb_tls.py
index 8f00bde7b..cfb0a7398 100644
--- a/lib/charms/mongodb/v1/mongodb_tls.py
+++ b/lib/charms/mongodb/v1/mongodb_tls.py
@@ -12,7 +12,7 @@
 import logging
 import re
 import socket
-from typing import Dict, List, Optional, Tuple
+from typing import Optional, Tuple
 
 from charms.tls_certificates_interface.v3.tls_certificates import (
     CertificateAvailableEvent,
@@ -42,7 +42,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 4
+LIBPATCH = 5
 
 WAIT_CERT_UPDATE = "wait-cert-updated"
 
@@ -105,9 +105,6 @@ def request_certificate(
         internal: bool,
     ):
         """Request TLS certificate."""
-        if not self.charm.model.get_relation(Config.TLS.TLS_PEER_RELATION):
-            return
-
         if param is None:
             key = generate_private_key()
         else:
@@ -234,7 +231,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
             self.charm.cluster.update_ca_secret(new_ca=event.ca)
             self.charm.config_server.update_ca_secret(new_ca=event.ca)
 
-        if self.waiting_for_both_certs():
+        if self.is_waiting_for_both_certs():
             logger.debug(
                 "Defer till both internal and external TLS certificates available to avoid second restart."
             )
@@ -256,7 +253,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
         # clear waiting status if db service is ready
         self.charm.status.set_and_share_status(ActiveStatus())
 
-    def waiting_for_both_certs(self):
+    def is_waiting_for_both_certs(self) -> bool:
         """Returns a boolean indicating whether additional certs are needed."""
         if not self.get_tls_secret(internal=True, label_name=Config.TLS.SECRET_CERT_LABEL):
             logger.debug("Waiting for internal certificate.")
@@ -295,6 +292,10 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None:
             return
 
         logger.debug("Generating a new Certificate Signing Request.")
+        self.request_new_certificates(internal)
+
+    def request_new_certificates(self, internal: bool) -> None:
+        """Requests the renewal of a new certificate."""
         key = self.get_tls_secret(internal, Config.TLS.SECRET_KEY_LABEL).encode("utf-8")
         old_csr = self.get_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL).encode("utf-8")
         sans = self.get_new_sans()
@@ -313,8 +314,9 @@ def _on_certificate_expiring(self, event: CertificateExpiringEvent) -> None:
         )
 
         self.set_tls_secret(internal, Config.TLS.SECRET_CSR_LABEL, new_csr.decode("utf-8"))
+        self.set_waiting_for_cert_to_update(waiting=True, internal=internal)
 
-    def get_new_sans(self) -> Dict:
+    def get_new_sans(self) -> dict[str, list[str]]:
         """Create a list of DNS names for a MongoDB unit.
 
         Returns:
@@ -341,7 +343,7 @@ def get_new_sans(self) -> Dict:
         return sans
 
-    def get_current_sans(self, internal: bool) -> List[str] | None:
+    def get_current_sans(self, internal: bool) -> dict[str, list[str]] | None:
         """Gets the current SANs for the unit cert."""
         # if unit has no certificates do not proceed.
         if not self.is_tls_enabled(internal=internal):
             return
@@ -411,9 +413,9 @@ def _get_subject_name(self) -> str:
 
     def is_set_waiting_for_cert_to_update(
         self,
-        internal=False,
+        internal: bool = False,
     ) -> bool:
-        """Returns True we are waiting for a cert to update."""
+        """Returns True if we are waiting for a cert to update."""
         scope = "int" if internal else "ext"
         label_name = f"{scope}-{WAIT_CERT_UPDATE}"
 
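Note on the mongodb_tls.py hunks above: certificate renewal is split out of `_on_certificate_expiring` into a reusable `request_new_certificates(internal)` helper, which also raises the per-scope `wait-cert-updated` flag via `set_waiting_for_cert_to_update`. A minimal sketch of how a charm could force rotation of both unit certificates with this helper; the `rotate_all_certificates` wrapper is hypothetical and not part of the patch:

```python
# Sketch only. Assumes `charm.tls` is the MongoDBTLS object shown in the patch.
def rotate_all_certificates(charm) -> None:
    for internal in (True, False):
        # Re-issues a CSR from the stored private key and marks this scope as
        # waiting for the renewed certificate (wait-cert-updated flag).
        charm.tls.request_new_certificates(internal)
```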
diff --git a/lib/charms/mongodb/v1/shards_interface.py b/lib/charms/mongodb/v1/shards_interface.py
index ae6f9e8f1..25c9f7641 100644
--- a/lib/charms/mongodb/v1/shards_interface.py
+++ b/lib/charms/mongodb/v1/shards_interface.py
@@ -58,7 +58,7 @@
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 11
+LIBPATCH = 12
 
 KEYFILE_KEY = "key-file"
 HOSTS_KEY = "host"
@@ -711,7 +711,7 @@ def _on_relation_changed(self, event):
 
         self.update_member_auth(event, (key_file_enabled, tls_enabled))
 
-        if tls_enabled and self.charm.tls.waiting_for_both_certs():
+        if tls_enabled and self.charm.tls.is_waiting_for_both_certs():
             logger.info("Waiting for requested certs, before restarting and adding to cluster.")
             event.defer()
             return
diff --git a/lib/charms/tls_certificates_interface/v3/tls_certificates.py b/lib/charms/tls_certificates_interface/v3/tls_certificates.py
index da7fa95e6..141412b00 100644
--- a/lib/charms/tls_certificates_interface/v3/tls_certificates.py
+++ b/lib/charms/tls_certificates_interface/v3/tls_certificates.py
@@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven
 
 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 20
+LIBPATCH = 23
 
 PYDEPS = ["cryptography", "jsonschema"]
 
@@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None:
                 )
             else:
                 try:
+                    secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
                     logger.debug(
                         "Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}"
                     )
-                    secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}")
+                    # Juju < 3.6 will create a new revision even if the content is the same
+                    if (
+                        secret.get_content(refresh=True).get("certificate", "")
+                        == certificate.certificate
+                    ):
+                        logger.debug(
+                            "Secret %s with correct certificate already exists",
+                            f"{LIBID}-{csr_in_sha256_hex}",
+                        )
+                        continue
                     secret.set_content(
                         {"certificate": certificate.certificate, "csr": certificate.csr}
                     )
@@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None:
         provider_certificate = self._find_certificate_in_relation_data(csr)
         if not provider_certificate:
             # A secret expired but we did not find matching certificate. Cleaning up
+            logger.warning(
+                "Failed to find matching certificate for csr, cleaning up secret %s",
+                event.secret.label,
+            )
             event.secret.remove_all_revisions()
             return
 
         if not provider_certificate.expiry_time:
             # A secret expired but matching certificate is invalid. Cleaning up
+            logger.warning(
+                "Certificate matching csr is invalid, cleaning up secret %s",
+                event.secret.label,
+            )
             event.secret.remove_all_revisions()
             return
 
@@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti
                 return provider_certificate
         return None
 
-    def _get_csr_from_secret(self, secret: Secret) -> str:
+    def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]:
         """Extract the CSR from the secret label or content.
 
         This function is a workaround to maintain backwards compatibility
         and fix the issue reported in
         https://github.com/canonical/tls-certificates-interface/issues/228
         """
-        if not (csr := secret.get_content().get("csr", "")):
+        try:
+            content = secret.get_content(refresh=True)
+        except SecretNotFoundError:
+            return None
+        if not (csr := content.get("csr", None)):
             # In versions <14 of the Lib we were storing the CSR in the label of the secret
             # The CSR now is stored int the content of the secret, which was a breaking change
             # Here we get the CSR if the secret was created by an app using libpatch 14 or lower
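Note on the `_on_relation_changed` guard above: Juju versions before 3.6 create a new secret revision even when `set_content` writes identical content, which causes avoidable secret-changed churn, so the library now compares before writing. A standalone sketch of the same pattern against the `ops` secret API; the label and field names are illustrative, not the library's:

```python
from ops.charm import CharmBase


def publish_certificate(charm: CharmBase, label: str, certificate: str, csr: str) -> None:
    """Store a certificate in a named Juju secret without redundant revisions."""
    secret = charm.model.get_secret(label=label)
    # Compare before writing: on Juju < 3.6 an identical set_content()
    # still creates a new revision.
    if secret.get_content(refresh=True).get("certificate", "") == certificate:
        return
    secret.set_content({"certificate": certificate, "csr": csr})
```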
diff --git a/src/charm.py b/src/charm.py
index 5cd2ee13c..08669e122 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -636,15 +636,17 @@ def _configure_container(self, container: Container) -> None:
         except FailedToUpdateFilesystem as err:
             raise ContainerNotReadyError from err
 
-        self._configure_layers(container)
-
-        # when a network cuts and the pod restarts - reconnect to the exporter
         try:
+            self._configure_layers(container)
+            # when a network cuts and the pod restarts - reconnect to the exporter and pbm
            self._connect_mongodb_exporter()
            self._connect_pbm_agent()
         except MissingSecretError as e:
             logger.error("Cannot connect mongodb exporter: %r", e)
             raise ContainerNotReadyError
+        except ChangeError as e:
+            logger.error("Cannot configure container layers %r", e)
+            raise ContainerNotReadyError
 
     # BEGIN: charm events
     def _on_upgrade(self, event: UpgradeCharmEvent) -> None:
@@ -927,6 +929,8 @@ def mongodb_storage_detaching(self, event) -> None:
             self.shard.wait_for_draining(mongos_hosts)
             logger.info("Shard successfully drained storage.")
+            return
+
 
         try:
             # retries over a period of 10 minutes in an attempt to resolve race conditions it is
             logger.debug("Removing %s from replica set", self.unit_host(self.unit))
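Note on the `_configure_container` hunk above: moving `_configure_layers` inside the `try` block means a Pebble `ChangeError` (for example a service that fails to start while the workload container is still settling) is caught and re-raised as the charm's `ContainerNotReadyError` instead of failing the hook outright. A simplified sketch of that pattern; the layer definition and the `RuntimeError` standing in for `ContainerNotReadyError` are illustrative only:

```python
from ops.model import Container
from ops.pebble import ChangeError, Layer

# Illustrative layer; the charm builds its real layers in _configure_layers().
_LAYER = Layer(
    {"services": {"mongod": {"override": "replace", "command": "mongod", "startup": "enabled"}}}
)


def configure_workload(container: Container) -> None:
    try:
        container.add_layer("mongod", _LAYER, combine=True)
        container.replan()  # may raise ChangeError if a service cannot start
    except ChangeError as err:
        # The charm raises ContainerNotReadyError here so the hook can retry later.
        raise RuntimeError("cannot configure container layers") from err
```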
diff --git a/src/upgrades/kubernetes_upgrades.py b/src/upgrades/kubernetes_upgrades.py
index 1ba92f478..ac8309b00 100644
--- a/src/upgrades/kubernetes_upgrades.py
+++ b/src/upgrades/kubernetes_upgrades.py
@@ -224,33 +224,36 @@ def reconcile_partition(
         # This does not address the situation where another unit > 1 restarts and sets the
         # partition during the `stop` event, but that is unlikely to occur in the small time window
         # that causes the unit to hang.
-        if partition_ < self._partition:
-            self._partition = partition_
-            logger.debug(
-                f"Lowered partition to {partition_} {action_event=} {force=} {self.in_progress=}"
-            )
         if action_event:
             assert len(units) >= 2
-            if self._partition > unit_number(units[1]):
+            if partition_ > unit_number(units[1]):
                 message = "Highest number unit is unhealthy. Refresh will not resume."
                 logger.debug(f"Resume upgrade event failed: {message}")
                 action_event.fail(message)
-                return
-            if force:
-                # If a unit was unhealthy and the upgrade was forced, only the next unit will
-                # upgrade. As long as 1 or more units are unhealthy, the upgrade will need to be
-                # forced for each unit.
-
-                # Include "Attempting to" because (on Kubernetes) we only control the partition,
-                # not which units upgrade. Kubernetes may not upgrade a unit even if the partition
-                # allows it (e.g. if the charm container of a higher unit is not ready). This is
-                # also applicable `if not force`, but is unlikely to happen since all units are
-                # healthy `if not force`.
-                message = f"Attempting to refresh unit {self._partition}."
             else:
-                message = f"Refresh resumed. Unit {self._partition} is refreshing next."
-            action_event.set_results({"result": message})
-            logger.debug(f"Resume refresh succeeded: {message}")
+                if force:
+                    # If a unit was unhealthy and the upgrade was forced, only
+                    # the next unit will upgrade. As long as 1 or more units
+                    # are unhealthy, the upgrade will need to be forced for
+                    # each unit.
+
+                    # Include "Attempting to" because (on Kubernetes) we only
+                    # control the partition, not which units upgrade.
+                    # Kubernetes may not upgrade a unit even if the partition
+                    # allows it (e.g. if the charm container of a higher unit
+                    # is not ready). This is also applicable `if not force`,
+                    # but is unlikely to happen since all units are healthy `if
+                    # not force`.
+                    message = f"Attempting to refresh unit {self._partition}."
+                else:
+                    message = f"Refresh resumed. Unit {self._partition} is refreshing next."
+                action_event.set_results({"result": message})
+                logger.debug(f"Resume refresh succeeded: {message}")
+        if partition_ < self._partition:
+            self._partition = partition_
+            logger.debug(
+                f"Lowered partition to {partition_} {action_event=} {force=} {self.in_progress=}"
+            )
 
 
 partition = _Partition()
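Note on the `reconcile_partition` rewrite above: the resume-refresh action is answered first, using the freshly computed `partition_`, and the stored partition is only lowered afterwards; the fail path no longer returns early, so the partition is still lowered when the highest unit is unhealthy. A condensed, side-effect-free sketch of that ordering with simplified names (this is not the charm's actual function signature):

```python
import logging

logger = logging.getLogger(__name__)


def reconcile_partition(
    current_partition: int, desired_partition: int, resume_requested: bool, highest_unit: int
) -> int:
    """Return the partition to apply, answering a resume request first."""
    if resume_requested:
        if desired_partition > highest_unit:
            logger.debug("Resume refresh failed: highest number unit is unhealthy.")
        else:
            logger.debug("Refresh resumed. Unit %s is refreshing next.", desired_partition)
    # Lowering happens last; the partition is never raised here.
    return min(current_partition, desired_partition)
```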
diff --git a/tests/integration/backup_tests/test_backups.py b/tests/integration/backup_tests/test_backups.py
index f0a80c325..75aba1ee2 100644
--- a/tests/integration/backup_tests/test_backups.py
+++ b/tests/integration/backup_tests/test_backups.py
@@ -12,13 +12,7 @@
 import pytest_asyncio
 import yaml
 from pytest_operator.plugin import OpsTest
-from tenacity import (
-    RetryError,
-    Retrying,
-    stop_after_attempt,
-    stop_after_delay,
-    wait_fixed,
-)
+from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed
 
 from ..ha_tests import helpers as ha_helpers
 from ..helpers import (
@@ -244,7 +238,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
     db_unit = await helpers.get_leader_unit(ops_test)
 
     # create first backup once ready
-    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
+    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)
 
     action = await db_unit.run_action(action_name="create-backup")
     first_backup = await action.wait()
@@ -262,7 +256,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
     }
     await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)
 
-    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
+    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)
 
     # create a backup as soon as possible. might not be immediately possible since only one backup
     # can happen at a time.
@@ -279,7 +273,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
     # backup can take a lot of time so this function returns once the command was successfully
     # sent to pbm. Therefore before checking, wait for Charmed MongoDB to finish creating the
     # backup
-    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
+    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)
 
     # verify that backups was made in GCP bucket
     try:
@@ -298,7 +292,7 @@ async def test_multi_backup(ops_test: OpsTest, github_secrets, continuous_writes
         "endpoint": "https://s3.amazonaws.com",
     }
     await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)
-    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20),
+    await ops_test.model.wait_for_idle(apps=[db_app_name], status="active", idle_period=20)
 
     # verify that backups was made on the AWS bucket
     try:
@@ -448,13 +442,14 @@ async def test_restore_new_cluster(
     ), "Backups from old cluster are listed as failed"
 
     # find most recent backup id and restore
-    for attempt in Retrying(stop=stop_after_attempt(120), wait=wait_fixed(1), reraise=True):
-        action = await leader_unit.run_action(action_name="list-backups")
-        list_result = await action.wait()
-        list_result = list_result.results["backups"]
-        most_recent_backup = list_result.split("\n")[-1]
-        backup_id = most_recent_backup.split()[0]
-        assert "-----" not in backup_id, "list of backups are empty."
+    action = await leader_unit.run_action(action_name="list-backups")
+    list_result = await action.wait()
+    list_result = list_result.results["backups"]
+    most_recent_backup = list_result.split("\n")[-1]
+    backup_id = most_recent_backup.split()[0]
+    action = await leader_unit.run_action(action_name="restore", **{"backup-id": backup_id})
+    restore = await action.wait()
+    assert restore.results["restore-status"] == "restore started", "restore not successful"
 
     # verify all writes are present
     try:
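Note on the test_backups.py hunks above: several of them only drop a stray trailing comma after `await ops_test.model.wait_for_idle(...)`. The comma is worth removing because it turns the statement into a one-element tuple expression, which linters flag and which obscures the intent, as this small example shows:

```python
import asyncio


async def wait_for_idle() -> str:
    return "idle"


async def main() -> None:
    result = await wait_for_idle(),  # trailing comma: result is a tuple
    assert result == ("idle",)
    result = await wait_for_idle()  # without it: result is the awaited value
    assert result == "idle"


asyncio.run(main())
```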
diff --git a/tests/integration/backup_tests/test_sharding_backups.py b/tests/integration/backup_tests/test_sharding_backups.py
index 9dd60e6e0..c1279487b 100644
--- a/tests/integration/backup_tests/test_sharding_backups.py
+++ b/tests/integration/backup_tests/test_sharding_backups.py
@@ -107,7 +107,7 @@ async def test_set_credentials_in_cluster(ops_test: OpsTest, github_secrets) ->
 
     # apply new configuration options
     await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)
-    await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active", timeout=TIMEOUT)
+    await ops_test.model.wait_for_idle(apps=CLUSTER_APPS, status="active", timeout=TIMEOUT)
 
     await setup_cluster_and_s3(ops_test)
 
diff --git a/tests/integration/ha_tests/test_ha.py b/tests/integration/ha_tests/test_ha.py
index 322fb3e72..d13eb76f4 100644
--- a/tests/integration/ha_tests/test_ha.py
+++ b/tests/integration/ha_tests/test_ha.py
@@ -596,7 +596,9 @@ async def test_network_cut(ops_test: OpsTest, continuous_writes, chaos_mesh):
 
     # we need to give juju some time to realize that the instance is back online
     time.sleep(RESTART_DELAY)
-
+    await ops_test.model.wait_for_idle(
+        apps=[app], status="active", raise_on_blocked=False, timeout=1000
+    )
     await wait_until_unit_in_status(ops_test, primary, active_unit, "SECONDARY")
 
     # verify presence of primary, replica set member configuration, and number of primaries
diff --git a/tests/integration/metrics_tests/test_metrics.py b/tests/integration/metrics_tests/test_metrics.py
index fc2d6a751..0ace81050 100644
--- a/tests/integration/metrics_tests/test_metrics.py
+++ b/tests/integration/metrics_tests/test_metrics.py
@@ -124,6 +124,9 @@ async def test_endpoints_network_cut(ops_test: OpsTest, chaos_mesh):
     time.sleep(60)
 
     # Wait for the network to be restored
+    await ops_test.model.wait_for_idle(
+        apps=[app_name], status="active", raise_on_blocked=False, timeout=1000
+    )
     await ha_helpers.wait_until_unit_in_status(ops_test, primary, active_unit, "SECONDARY")
 
     for unit in ops_test.model.applications[app_name].units:
diff --git a/tests/integration/sharding_tests/test_mongos.py b/tests/integration/sharding_tests/test_mongos.py
index a065b190a..e81eddc0a 100644
--- a/tests/integration/sharding_tests/test_mongos.py
+++ b/tests/integration/sharding_tests/test_mongos.py
@@ -147,11 +147,12 @@ async def test_disconnect_from_cluster_removes_user(ops_test: OpsTest) -> None:
         f"{CONFIG_SERVER_APP_NAME}:cluster",
     )
     await ops_test.model.wait_for_idle(
-        apps=[CONFIG_SERVER_APP_NAME, MONGOS_APP_NAME],
-        idle_period=20,
+        apps=[CONFIG_SERVER_APP_NAME],
+        status="active",
+        idle_period=30,
         timeout=TIMEOUT,
-        raise_on_error=False,
     )
+
     num_users_after_removal = count_users(mongos_client)
 
     for attempt in tenacity.Retrying(
diff --git a/tests/integration/upgrades/test_revision_check.py b/tests/integration/upgrades/test_revision_check.py
index c0f697f6f..f643dfa55 100644
--- a/tests/integration/upgrades/test_revision_check.py
+++ b/tests/integration/upgrades/test_revision_check.py
@@ -33,14 +33,14 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
         MONGODB_K8S_CHARM,
         application_name=REMOTE_SHARD_APP_NAME,
         config={"role": "shard"},
-        channel="edge",
+        channel="6/edge",
     )
 
     await ops_test.model.deploy(
         MONGODB_K8S_CHARM,
         application_name=REMOTE_CONFIG_SERVER_APP_NAME,
         config={"role": "config-server"},
-        channel="edge",
+        channel="6/edge",
     )
     await ops_test.model.deploy(
         my_charm,
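Note on the test_revision_check.py change above: `channel="6/edge"` pins the deployment to track `6` at risk `edge`, whereas a bare `edge` follows whichever track Charmhub marks as the default. For illustration only, the same pin in a pytest-operator deployment; the test and application names here are made up:

```python
import pytest
from pytest_operator.plugin import OpsTest


@pytest.mark.asyncio
async def test_deploy_pinned_channel(ops_test: OpsTest) -> None:
    await ops_test.model.deploy(
        "mongodb-k8s",
        application_name="example-shard",
        config={"role": "shard"},
        channel="6/edge",  # track "6", risk "edge"
    )
```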