Skip to content

Commit

Permalink
[DPE-3839] restoring backups with name mismatch (#381)
Browse files Browse the repository at this point in the history
## Issue
Migrating a cluster to a cluster with a new name is not supported

## Solution
Support this by passing an option (`remap-pattern`) to the restore action.
  • Loading branch information
MiaAltieri authored Mar 21, 2024
1 parent c2f1c57 commit 187c491
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 82 deletions.
8 changes: 7 additions & 1 deletion actions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ get-primary:
description: Report primary replica

get-password:
description: Fetch the password of the provided internal user of the charm, used for internal charm operations.
description:
Fetch the password of the provided internal user of the charm, used for internal charm operations.
It is for internal charm users only, and SHOULD NOT be used by applications.
params:
username:
Expand Down Expand Up @@ -39,6 +40,11 @@ restore:
backup-id:
type: string
description: A backup-id to identify the backup to restore. Format of <%Y-%m-%dT%H:%M:%SZ>
remap-pattern:
type: string
description:
Optional, a pattern used to remap cluster component names when performing a restore.
Format of old_config_server_name=new_config_server_name,old_shard_name=new_shard_name

set-tls-private-key:
description: Set the private key, which will be used for certificate signing requests (CSR). Run for each unit separately.
Expand Down
120 changes: 77 additions & 43 deletions lib/charms/mongodb/v1/mongodb_backups.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 4
LIBPATCH = 5

logger = logging.getLogger(__name__)

Expand All @@ -54,7 +54,7 @@
"storage-class": "storage.s3.storageClass",
}
S3_RELATION = "s3-credentials"
REMAPPING_PATTERN = r"\ABackup doesn't match current cluster topology - it has different replica set names. Extra shards in the backup will cause this, for a simple example. The extra/unknown replica set names found in the backup are: ([^,\s]+)([.] Backup has no data for the config server or sole replicaset)?\Z"
REMAPPING_PATTERN = r"\ABackup doesn't match current cluster topology - it has different replica set names. Extra shards in the backup will cause this, for a simple example. The extra/unknown replica set names found in the backup are: ([\w\d\-,\s]+)([.] Backup has no data for the config server or sole replicaset)?\Z"
PBM_STATUS_CMD = ["status", "-o", "json"]
MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current"
BACKUP_RESTORE_MAX_ATTEMPTS = 10
Expand Down Expand Up @@ -141,13 +141,13 @@ def _on_s3_credential_changed(self, event: CredentialsChangedEvent):
return

if not self.charm.db_initialised:
self._defer_action_with_info_log(
self._defer_event_with_info_log(
event, action, "Set PBM credentials, MongoDB not ready."
)
return

if not self.charm.has_backup_service():
self._defer_action_with_info_log(
self._defer_event_with_info_log(
event, action, "Set PBM configurations, pbm-agent service not found."
)
return
Expand Down Expand Up @@ -181,7 +181,7 @@ def _on_create_backup_action(self, event) -> None:
return

if isinstance(pbm_status, WaitingStatus):
self._defer_action_with_info_log(
self._fail_action_with_error_log(
event,
action,
"Sync-ing configurations needs more time, must wait before creating a backup.",
Expand Down Expand Up @@ -215,7 +215,7 @@ def _on_list_backups_action(self, event) -> None:
self.charm.unit.status = pbm_status

if isinstance(pbm_status, WaitingStatus):
self._defer_action_with_info_log(
self._fail_action_with_error_log(
event,
action,
"Sync-ing configurations needs more time, must wait before listing backups.",
Expand All @@ -238,18 +238,40 @@ def _on_restore_action(self, event) -> None:
if not self._pass_sanity_checks(event, action):
return

if not self._restore_hook_checks(event):
return

# sometimes when we are trying to restore pmb can be resyncing, so we need to retry
try:
backup_id = event.params.get("backup-id")
self._restore(backup_id, remapping_args=event.params.get("remap-pattern"))
self.charm.unit.status = MaintenanceStatus(
f"restore started/running, backup id:'{backup_id}'"
)
self._success_action_with_info_log(
event, action, {"restore-status": "restore started"}
)
except ResyncError:
raise
except RestoreError as restore_error:
self._fail_action_with_error_log(event, action, str(restore_error))

# BEGIN: helper functions
def _restore_hook_checks(self, event) -> bool:
"""Runs pre-hook checks specific to running the restore command."""
action = "restore"
backup_id = event.params.get("backup-id")
if not backup_id:
self._fail_action_with_error_log(event, action, "Missing backup-id to restore")
return
return False

# only leader can restore backups. This prevents multiple restores from being attempted at
# once.
if not self.charm.unit.is_leader():
self._fail_action_with_error_log(
event, action, "The action can be run only on leader unit."
)
return
return False

# cannot restore backup if pbm is not ready. This could be due to: resyncing, incompatible,
# options, incorrect credentials, creating a backup, or already performing a restore.
Expand All @@ -259,37 +281,33 @@ def _on_restore_action(self, event) -> None:
self._fail_action_with_error_log(
event, action, "Please wait for current backup/restore to finish."
)
return
return False

if isinstance(pbm_status, WaitingStatus):
self._defer_action_with_info_log(
self._fail_action_with_error_log(
event,
action,
"Sync-ing configurations needs more time, must wait before restoring.",
)
return
return False

if isinstance(pbm_status, BlockedStatus):
self._fail_action_with_error_log(
event, action, f"Cannot restore backup {pbm_status.message}."
)
return
return False

# sometimes when we are trying to restore pmb can be resyncing, so we need to retry
try:
self._try_to_restore(backup_id)
self.charm.unit.status = MaintenanceStatus(
f"restore started/running, backup id:'{backup_id}'"
)
self._success_action_with_info_log(
event, action, {"restore-status": "restore started"}
if (
self._needs_provided_remap_arguments(backup_id)
and event.params.get("remap-pattern") is None
):
self._fail_action_with_error_log(
event, action, "Cannot restore backup, 'remap-pattern' must be set."
)
except ResyncError:
raise
except RestoreError as restore_error:
self._fail_action_with_error_log(event, action, str(restore_error))
return False

return True

# BEGIN: helper functions
def is_valid_s3_integration(self) -> bool:
"""Return true if relation to s3-integrator is valid.
Expand Down Expand Up @@ -337,13 +355,13 @@ def _configure_pbm_options(self, event) -> None:
return
except ResyncError:
self.charm.unit.status = WaitingStatus("waiting to sync s3 configurations.")
self._defer_action_with_info_log(
self._defer_event_with_info_log(
event, action, "Sync-ing configurations needs more time."
)
return
except PBMBusyError:
self.charm.unit.status = WaitingStatus("waiting to sync s3 configurations.")
self._defer_action_with_info_log(
self._defer_event_with_info_log(
event,
action,
"Cannot update configs while PBM is running, must wait for PBM action to finish.",
Expand Down Expand Up @@ -496,7 +514,7 @@ def _generate_backup_list_output(self) -> str:
if backup["status"] == "error":
# backups from a different cluster have an error status, but they should show as
# finished
if self._backup_from_different_cluster(backup.get("error", "")):
if self._is_backup_from_different_cluster(backup.get("error", "")):
backup_status = "finished"
else:
# display reason for failure if available
Expand Down Expand Up @@ -532,11 +550,11 @@ def _format_backup_list(self, backup_list: List[str]) -> str:

return "\n".join(backups)

def _backup_from_different_cluster(self, backup_status: str) -> bool:
def _is_backup_from_different_cluster(self, backup_status: str) -> bool:
    """Report whether the given backup status message came from another cluster.

    PBM flags backups taken on a cluster with different replica set names with
    a topology-mismatch error; detecting that message tells us the backup
    originated elsewhere and will need replica set remapping on restore.
    """
    mismatch = re.search(REMAPPING_PATTERN, backup_status)
    return mismatch is not None

def _try_to_restore(self, backup_id: str) -> None:
def _restore(self, backup_id: str, remapping_args: Optional[str] = None) -> None:
"""Try to restore cluster a backup specified by backup id.
If PBM is resyncing, the function will retry to create backup
Expand All @@ -553,7 +571,10 @@ def _try_to_restore(self, backup_id: str) -> None:
):
with attempt:
try:
remapping_args = self._remap_replicaset(backup_id)
remapping_args = remapping_args or self._remap_replicaset(backup_id)
if remapping_args:
remapping_args = f"--replset-remapping {remapping_args}"

restore_cmd = ["restore", backup_id]
if remapping_args:
restore_cmd = restore_cmd + remapping_args.split(" ")
Expand Down Expand Up @@ -619,34 +640,27 @@ def _remap_replicaset(self, backup_id: str) -> str:
pbm_status = json.loads(pbm_status)

# grab the error status from the backup if present
backups = pbm_status["backups"]["snapshot"] or []
backup_status = ""
for backup in backups:
if not backup_id == backup["name"]:
continue
backup_error_status = self.get_backup_error_status(backup_id)

backup_status = backup.get("error", "")
break

if not self._backup_from_different_cluster(backup_status):
if not self._is_backup_from_different_cluster(backup_error_status):
return ""

# TODO in the future when we support conf servers and shards this will need to be more
# comprehensive.
old_cluster_name = re.search(REMAPPING_PATTERN, backup_status).group(1)
old_cluster_name = re.search(REMAPPING_PATTERN, backup_error_status).group(1)
current_cluster_name = self.charm.app.name
logger.debug(
"Replica set remapping is necessary for restore, old cluster name: %s ; new cluster name: %s",
old_cluster_name,
current_cluster_name,
)
return f"--replset-remapping {current_cluster_name}={old_cluster_name}"
return f"{current_cluster_name}={old_cluster_name}"

def _fail_action_with_error_log(self, event, action: str, message: str) -> None:
    """Mark an action event as failed, recording the reason at ERROR level.

    Args:
        event: the Juju action event to mark as failed.
        action: human-readable action name used in the log line.
        message: failure reason reported back to the operator.
    """
    failure_reason = message
    logger.error("%s failed: %s", action.capitalize(), failure_reason)
    event.fail(failure_reason)

def _defer_action_with_info_log(self, event, action: str, message: str) -> None:
def _defer_event_with_info_log(self, event, action: str, message: str) -> None:
    """Defer the given event, noting why at INFO level.

    Args:
        event: the Juju event to defer for a later retry.
        action: human-readable action name used in the log line.
        message: reason the event cannot be processed yet.
    """
    deferral_reason = message
    logger.info("Deferring %s: %s", action, deferral_reason)
    event.defer()

Expand Down Expand Up @@ -733,3 +747,23 @@ def process_pbm_error(self, pbm_status: Optional[_StrOrBytes]) -> str:
elif "status code: 301" in error_message:
message = "s3 configurations are incompatible."
return message

def _needs_provided_remap_arguments(self, backup_id: str) -> bool:
"""Returns true if remap arguments are needed to perform a restore command."""
backup_error_status = self.get_backup_error_status(backup_id)

# When a charm is running as a Replica set it can generate its own remapping arguments
return self._is_backup_from_different_cluster(backup_error_status) and self.charm.is_role(
Config.Role.CONFIG_SERVER
)

def get_backup_error_status(self, backup_id: str) -> str:
    """Return the error message PBM recorded for a backup, or "" if none.

    Args:
        backup_id: name of the backup to look up in PBM's status output.

    Returns:
        The backup's "error" field when present, otherwise an empty string
        (also returned when no backup with that id is listed).
    """
    pbm_status = self.charm.run_pbm_command(["status", "--out=json"])
    pbm_status = json.loads(pbm_status)
    # PBM may report the snapshot list as JSON null ("snapshot": null);
    # `.get(..., [])` alone would hand that None to the loop, so coerce it.
    backups = pbm_status["backups"].get("snapshot") or []
    for backup in backups:
        if backup_id == backup["name"]:
            return backup.get("error", "")

    return ""
Loading

0 comments on commit 187c491

Please sign in to comment.