From 9997dd934bb811f1329f748e2dba618dfc9b4e3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vin=C3=ADcius=20Miguel?= <36349314+vrmiguel@users.noreply.github.com>
Date: Mon, 11 Nov 2024 19:23:28 -0300
Subject: [PATCH] Stop CNPG reconciliation for hibernated instances (#1038)

---
Reviewer note: this patch was re-flowed from a copy whose newlines and indentation had
been collapsed. Indentation and the position of blank context lines are reconstructed
(TODO: confirm against the target tree before applying). The index hash below predates
the review edits made to the hibernate.rs hunks.

 conductor/Cargo.lock                          |   2 +
 tembo-operator/src/cloudnativepg/hibernate.rs | 150 +++++++++++++++++-
 2 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/conductor/Cargo.lock b/conductor/Cargo.lock
index 2f14835ef..8ca70f65c 100644
--- a/conductor/Cargo.lock
+++ b/conductor/Cargo.lock
@@ -1068,6 +1068,7 @@ dependencies = [
  "env_logger",
  "futures",
  "google-cloud-storage",
+ "idna 0.5.0",
  "k8s-openapi",
  "kube",
  "log",
@@ -1083,6 +1084,7 @@ dependencies = [
  "sqlx",
  "thiserror",
  "tokio",
+ "url",
  "uuid",
 ]
 
diff --git a/tembo-operator/src/cloudnativepg/hibernate.rs b/tembo-operator/src/cloudnativepg/hibernate.rs
index 28b8aa0dc..d1f26859e 100644
--- a/tembo-operator/src/cloudnativepg/hibernate.rs
+++ b/tembo-operator/src/cloudnativepg/hibernate.rs
@@ -1,4 +1,5 @@
 use crate::apis::coredb_types::CoreDB;
+use crate::cloudnativepg::clusters::{ClusterStatusConditions, ClusterStatusConditionsStatus};
 use crate::cloudnativepg::cnpg::{get_cluster, get_pooler, get_scheduled_backups};
 use crate::cloudnativepg::poolers::Pooler;
 use crate::cloudnativepg::scheduledbackups::ScheduledBackup;
@@ -21,6 +22,8 @@ use std::sync::Arc;
 use std::time::Duration;
 use tracing::{debug, error, info, warn};
 
+use super::clusters::Cluster;
+
 /// Resolves hibernation in the Cluster and related services of the CoreDB
 ///
 /// If the cluster is in spec.stop state, this will activate the CNPG hibernation
@@ -142,18 +145,31 @@ pub async fn reconcile_cluster_hibernation(cdb: &CoreDB, ctx: &Arc<Context>) ->
         }
     }
 
-    // Build the hibernation patch we want to apply to disable the CNPG cluster.
+    // Stop CNPG reconciliation for hibernated instances.
+    // We should not stop CNPG reconciliation until hibernation is fully completed,
+    // as the instance may not finish hibernating otherwise.
+    //
+    // Disabling reconciliation for stopped instances is important because, as the number
+    // of stopped instances grows, reconciliation performance is significantly impacted
+    let stop_cnpg_reconciliation = cdb.spec.stop && is_cluster_hibernated(&cluster);
+    let stop_cnpg_reconciliation_value = if stop_cnpg_reconciliation {
+        "disabled"
+    } else {
+        "enabled"
+    };
     let cluster_annotations = cluster.metadata.annotations.unwrap_or_default();
     let hibernation_value = if cdb.spec.stop { "on" } else { "off" };
+
+    // Build the hibernation patch we want to apply to disable the CNPG cluster.
     let patch_hibernation_annotation = json!({
         "metadata": {
             "annotations": {
-                "cnpg.io/hibernation": hibernation_value
+                "cnpg.io/hibernation": hibernation_value,
+                "cnpg.io/reconciliationLoop": stop_cnpg_reconciliation_value,
             }
         }
     });
 
-
     // Update ScheduledBackup if it exists
     if let Err(action) = update_scheduled_backups(&scheduled_backups, cdb, ctx).await {
         warn!(
@@ -333,3 +349,131 @@ async fn update_scheduled_backups(
 
     Ok(())
 }
+
+/// Returns `true` when the status of `cluster` reports a completed hibernation.
+///
+/// Completion is signalled by a `cnpg.io/hibernation` status condition whose status is
+/// `True`. A cluster with no status, no conditions, or no such condition is treated as
+/// not hibernated.
+fn is_cluster_hibernated(cluster: &Cluster) -> bool {
+    fn get_hibernation_condition(cluster: &Cluster) -> Option<&ClusterStatusConditions> {
+        cluster
+            .status
+            .as_ref()?
+            .conditions
+            .as_ref()?
+            .iter()
+            .find(|condition| condition.r#type == "cnpg.io/hibernation")
+    }
+
+    get_hibernation_condition(cluster)
+        .map(|condition| condition.status == ClusterStatusConditionsStatus::True)
+        .unwrap_or(
+            // No cnpg.io/hibernation condition found: likely the cluster has never been hibernated
+            false,
+        )
+}
+
+#[cfg(test)]
+mod tests {
+    use kube::api::ObjectMeta;
+
+    use crate::cloudnativepg::{
+        clusters::{
+            Cluster, ClusterSpec, ClusterStatus, ClusterStatusConditions,
+            ClusterStatusConditionsStatus,
+        },
+        hibernate::is_cluster_hibernated,
+    };
+
+    #[test]
+    fn test_is_cluster_hibernated() {
+        // Not hibernated yet: still in progress
+        assert!(!is_cluster_hibernated(&hibernation_in_progress()));
+        // Not hibernated: unrelated condition
+        assert!(!is_cluster_hibernated(&backed_up_cluster()));
+        // Not hibernated: no status has been reported for the cluster
+        let without_status = Cluster {
+            status: None,
+            ..hibernation_completed()
+        };
+        assert!(!is_cluster_hibernated(&without_status));
+        // Hibernated: "type" is "cnpg.io/hibernation" and "status" is "True"
+        assert!(is_cluster_hibernated(&hibernation_completed()));
+    }
+
+    fn hibernation_in_progress() -> Cluster {
+        Cluster {
+            metadata: ObjectMeta {
+                name: Some("test-cluster".to_string()),
+                namespace: Some("test".to_string()),
+                ..ObjectMeta::default()
+            },
+            spec: ClusterSpec {
+                ..Default::default()
+            },
+            status: Some(ClusterStatus {
+                instances: Some(1),
+                conditions: Some(vec![ClusterStatusConditions {
+                    last_transition_time: "2024-11-11T19:33:58Z".into(),
+                    message: "Hibernation is in progress".into(),
+                    observed_generation: None,
+                    reason: "DeletingPods".into(),
+                    status: ClusterStatusConditionsStatus::False,
+                    r#type: "cnpg.io/hibernation".into(),
+                }]),
+                ..ClusterStatus::default()
+            }),
+        }
+    }
+
+    fn hibernation_completed() -> Cluster {
+        Cluster {
+            metadata: ObjectMeta {
+                name: Some("test-cluster".to_string()),
+                namespace: Some("test".to_string()),
+                ..ObjectMeta::default()
+            },
+            spec: ClusterSpec {
+                ..Default::default()
+            },
+            status: Some(ClusterStatus {
+                instances: Some(1),
+                conditions: Some(vec![ClusterStatusConditions {
+                    last_transition_time: "2024-11-11T19:33:58Z".into(),
+                    message: "Cluster has been hibernated".into(),
+                    observed_generation: None,
+                    reason: "Hibernated".into(),
+                    status: ClusterStatusConditionsStatus::True,
+                    r#type: "cnpg.io/hibernation".into(),
+                }]),
+                ..ClusterStatus::default()
+            }),
+        }
+    }
+
+    fn backed_up_cluster() -> Cluster {
+        Cluster {
+            metadata: ObjectMeta {
+                name: Some("test-cluster".to_string()),
+                namespace: Some("test".to_string()),
+                ..ObjectMeta::default()
+            },
+            spec: ClusterSpec {
+                ..Default::default()
+            },
+            status: Some(ClusterStatus {
+                instances: Some(1),
+                conditions: Some(vec![ClusterStatusConditions {
+                    last_transition_time: "2024-11-11T19:33:58Z".into(),
+                    message: "Backup was successful".into(),
+                    observed_generation: None,
+                    reason: "LastBackupSucceeded".into(),
+                    status: ClusterStatusConditionsStatus::True,
+                    r#type: "LastBackupSucceeded".into(),
+                }]),
+                ..ClusterStatus::default()
+            }),
+        }
+    }
+}