K8SPS-280: Improve full cluster crash recovery
Before these changes, we were rebooting the cluster from a complete outage
starting from pod-0, without checking which member had the latest
transactions. Therefore, our full cluster recovery implementation was prone
to data loss.

Now we're using mysql-shell's built-in checks to detect the member to
reboot from. For this, mysql-shell requires every member to be
reachable, so it can connect and check GTIDs on each one. That means in
case of a full cluster crash we need to start every pod and ensure it's
reachable.

We're bringing back the `/var/lib/mysql/full-cluster-crash` file to address
this requirement. Pods create this file if they detect they're in a full
cluster crash and restart themselves. After the restart, they start
the mysqld process but ensure the server comes up as read-only. After
all pods are up and running (ready), the operator runs
`dba.rebootClusterFromCompleteOutage()` in one of the MySQL pods. Which
pod we run this in is not important, since mysql-shell will connect
to each pod and select a suitable one to reboot from.
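
A minimal sketch of what this step could look like from the operator's side,
assuming the reboot is driven by shelling out to `mysqlsh` against any ready
pod; the function name, flags, and credential handling here are illustrative,
not the operator's actual implementation:

```go
package mysql

import (
	"context"
	"fmt"
	"os/exec"

	"github.com/pkg/errors"
)

// rebootFromCompleteOutage is a hypothetical helper: it asks mysql-shell to run
// dba.rebootClusterFromCompleteOutage() against one reachable member.
// mysql-shell then connects to every member, compares GTIDs, and bootstraps the
// group from the most up-to-date one, so it doesn't matter which pod we target.
func rebootFromCompleteOutage(ctx context.Context, user, pass, host, clusterName string) error {
	uri := fmt.Sprintf("%s:%s@%s", user, pass, host)
	script := fmt.Sprintf("dba.rebootClusterFromCompleteOutage('%s')", clusterName)

	cmd := exec.CommandContext(ctx, "mysqlsh", "--no-wizard", "--js", "--uri", uri, "-e", script)
	out, err := cmd.CombinedOutput()
	if err != nil {
		return errors.Wrapf(err, "reboot cluster from complete outage: %s", out)
	}
	return nil
}
```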

*Events*

This commit also introduces an event recorder and two events:

1. FullClusterCrashDetected
2. FullClusterCrashRecovered

Users will be able to see these events on the `PerconaServerMySQL` object.

For example:

```
$ kubectl describe ps cluster1
...
Events:
  Type     Reason                     Age                 From           Message
  ----     ------                     ----                ----           -------
  Warning  FullClusterCrashDetected   19m (x10 over 20m)  ps-controller  Full cluster crash detected
  Normal   FullClusterCrashRecovered  17m                 ps-controller  Cluster recovered from full cluster crash
```
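
A rough sketch of how the reconciler might emit these events with the standard
client-go event recorder; the helper and its parameters are assumptions for
illustration, only the event reasons and messages come from this commit:

```go
package mysql

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/tools/record"

	apiv1alpha1 "github.com/percona/percona-server-mysql-operator/api/v1alpha1"
)

// emitFullClusterCrashEvent is a hypothetical helper showing how the two
// events could be attached to the PerconaServerMySQL custom resource.
func emitFullClusterCrashEvent(rec record.EventRecorder, cr *apiv1alpha1.PerconaServerMySQL, recovered bool) {
	if !recovered {
		// Raised (possibly repeatedly) while pods are still restarting in read-only mode.
		rec.Event(cr, corev1.EventTypeWarning, "FullClusterCrashDetected", "Full cluster crash detected")
		return
	}
	// Raised once dba.rebootClusterFromCompleteOutage() has succeeded.
	rec.Event(cr, corev1.EventTypeNormal, "FullClusterCrashRecovered", "Cluster recovered from full cluster crash")
}
```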

*Probe timeouts*

Kubernetes had problems with timeouts in exec probes, which were fixed in
recent releases, but we still see problematic behavior. For example, even
though Kubernetes successfully detects a timeout in a probe, it doesn't
count the timeout as a failure, so the container is not restarted even if
its liveness probe times out a million times. With this commit we're
handling timeouts ourselves with contexts.
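
A minimal sketch of that idea, assuming the probe binary enforces its own
deadline and exits non-zero when it expires so the kubelet sees a plain
failure; `checkLiveness` is a placeholder, not the operator's real check:

```go
package main

import (
	"context"
	"log"
	"os"
	"time"
)

func main() {
	// Assumed to match the probe's timeoutSeconds in the pod spec.
	timeout := 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	if err := checkLiveness(ctx); err != nil {
		log.Printf("liveness check failed: %v", err)
		os.Exit(1) // a non-zero exit is always counted as a probe failure
	}
}

// checkLiveness stands in for the real health check (e.g. connecting to
// mysqld); it must honor ctx cancellation so the deadline takes effect.
func checkLiveness(ctx context.Context) error {
	select {
	case <-time.After(1 * time.Second): // pretend work
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
```
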
egegunes committed Jul 24, 2023
1 parent fd3c06f commit a2734f0
Showing 17 changed files with 550 additions and 272 deletions.
2 changes: 1 addition & 1 deletion api/v1alpha1/perconaservermysql_types.go
@@ -480,7 +480,7 @@ func (cr *PerconaServerMySQL) CheckNSetDefaults(ctx context.Context, serverVersi
cr.Spec.MySQL.LivenessProbe.SuccessThreshold = 1
}
if cr.Spec.MySQL.LivenessProbe.TimeoutSeconds == 0 {
cr.Spec.MySQL.LivenessProbe.TimeoutSeconds = 30
cr.Spec.MySQL.LivenessProbe.TimeoutSeconds = 10
}

if cr.Spec.MySQL.ReadinessProbe.InitialDelaySeconds == 0 {
52 changes: 24 additions & 28 deletions build/ps-entrypoint.sh
@@ -182,6 +182,12 @@ load_group_replication_plugin() {
POD_IP=$(hostname -I | awk '{print $1}')

sed -i "/\[mysqld\]/a plugin_load_add=group_replication.so" $CFG
sed -i "/\[mysqld\]/a group_replication_exit_state_action=ABORT_SERVER" $CFG
}

ensure_read_only() {
sed -i "/\[mysqld\]/a read_only=ON" $CFG
sed -i "/\[mysqld\]/a super_read_only=ON" $CFG
}

MYSQL_VERSION=$(mysqld -V | awk '{print $3}' | awk -F'.' '{print $1"."$2}')
@@ -399,34 +405,24 @@ if [[ -f /var/lib/mysql/full-cluster-crash ]]; then
namespace=$(</var/run/secrets/kubernetes.io/serviceaccount/namespace)

echo "######FULL_CLUSTER_CRASH:${node_name}######"
echo "You have full cluster crash. You need to recover the cluster manually. Here are the steps:"
echo ""
echo "Latest GTID_EXECUTED in this node is ${gtid_executed}"
echo "Compare GTIDs in each MySQL pod and select the one with the newest GTID."
echo ""
echo "Create /var/lib/mysql/force-bootstrap inside the mysql container. For example, if you select ${cluster_name}-mysql-2 to recover from:"
echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-2 -c mysql -- touch /var/lib/mysql/force-bootstrap"
echo ""
echo "Remove /var/lib/mysql/full-cluster-crash in this pod to re-bootstrap the group. For example:"
echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-2 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
echo "This will restart the mysql container."
echo ""
echo "After group is bootstrapped and mysql container is ready, move on to the other pods:"
echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-1 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
echo "Wait until the pod ready"
echo ""
echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-0 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
echo "Wait until the pod ready"
echo ""
echo "Continue to other pods if you have more."
echo "#####LAST_LINE:${node_name}:${gtid_executed}"

for (( ; ; )); do
if [[ ! -f /var/lib/mysql/full-cluster-crash ]]; then
exit 0
fi
sleep 5
done
echo "You are in a full cluster crash. Operator will attempt to fix the issue automatically."
echo "MySQL pods will be up and running in read only mode."
echo "Latest GTID_EXECUTED on this node is ${gtid_executed}"
echo "######FULL_CLUSTER_CRASH:${node_name}######"

ensure_read_only
fi

recovery_file='/var/lib/mysql/sleep-forever'
if [ -f "${recovery_file}" ]; then
set +o xtrace
echo "The $recovery_file file is detected, node is going to infinity loop"
echo "If you want to exit from infinity loop you need to remove $recovery_file file"
for (( ; ; )); do
if [ ! -f "${recovery_file}" ]; then
exit 0
fi
done
fi

exec "$@"
41 changes: 21 additions & 20 deletions cmd/bootstrap/async_replication.go
@@ -1,6 +1,7 @@
package main

import (
"context"
"log"
"os"
"path/filepath"
@@ -14,7 +15,7 @@ import (
"github.com/percona/percona-server-mysql-operator/pkg/replicator"
)

func bootstrapAsyncReplication() error {
func bootstrapAsyncReplication(ctx context.Context) error {
timer := stopwatch.NewNamedStopwatch()
err := timer.AddMany([]string{"clone", "total"})
if err != nil {
@@ -45,7 +46,7 @@ func bootstrapAsyncReplication() error {
return errors.Wrap(err, "wait lock removal")
}
}
primary, replicas, err := getTopology(peers)
primary, replicas, err := getTopology(ctx, peers)
if err != nil {
return errors.Wrap(err, "select donor")
}
@@ -74,7 +75,7 @@ }
}
log.Printf("PrimaryIP: %s", primaryIp)

donor, err := selectDonor(fqdn, primary, replicas)
donor, err := selectDonor(ctx, fqdn, primary, replicas)
if err != nil {
return errors.Wrap(err, "select donor")
}
@@ -86,33 +87,33 @@ func bootstrapAsyncReplication() error {
return errors.Wrapf(err, "get %s password", apiv1alpha1.UserOperator)
}

db, err := replicator.NewReplicator("operator", operatorPass, podIp, mysql.DefaultAdminPort)
db, err := replicator.NewReplicator(ctx, "operator", operatorPass, podIp, mysql.DefaultAdminPort)
if err != nil {
return errors.Wrap(err, "connect to db")
}
defer db.Close()

if err := db.StopReplication(); err != nil {
if err := db.StopReplication(ctx); err != nil {
return err
}

switch {
case donor == "":
if err := db.ResetReplication(); err != nil {
if err := db.ResetReplication(ctx); err != nil {
return err
}

log.Printf("Can't find a donor, we're on our own.")
return nil
case donor == fqdn:
if err := db.ResetReplication(); err != nil {
if err := db.ResetReplication(ctx); err != nil {
return err
}

log.Printf("I'm the donor and therefore the primary.")
return nil
case primary == fqdn || primaryIp == podIp:
if err := db.ResetReplication(); err != nil {
if err := db.ResetReplication(ctx); err != nil {
return err
}

@@ -129,7 +130,7 @@ func bootstrapAsyncReplication() error {
log.Printf("Clone required: %t", requireClone)
if requireClone {
log.Println("Checking if a clone in progress")
inProgress, err := db.CloneInProgress()
inProgress, err := db.CloneInProgress(ctx)
if err != nil {
return errors.Wrap(err, "check if a clone in progress")
}
@@ -141,7 +142,7 @@ func bootstrapAsyncReplication() error {

timer.Start("clone")
log.Printf("Cloning from %s", donor)
err = db.Clone(donor, "operator", operatorPass, mysql.DefaultAdminPort)
err = db.Clone(ctx, donor, "operator", operatorPass, mysql.DefaultAdminPort)
timer.Stop("clone")
if err != nil && !errors.Is(err, replicator.ErrRestartAfterClone) {
return errors.Wrapf(err, "clone from donor %s", donor)
@@ -164,7 +165,7 @@ func bootstrapAsyncReplication() error {
}
}

rStatus, _, err := db.ReplicationStatus()
rStatus, _, err := db.ReplicationStatus(ctx)
if err != nil {
return errors.Wrap(err, "check replication status")
}
@@ -177,23 +178,23 @@ func bootstrapAsyncReplication() error {
return errors.Wrapf(err, "get %s password", apiv1alpha1.UserReplication)
}

if err := db.StopReplication(); err != nil {
if err := db.StopReplication(ctx); err != nil {
return errors.Wrap(err, "stop replication")
}

if err := db.StartReplication(primary, replicaPass, mysql.DefaultPort); err != nil {
if err := db.StartReplication(ctx, primary, replicaPass, mysql.DefaultPort); err != nil {
return errors.Wrap(err, "start replication")
}
}

if err := db.EnableSuperReadonly(); err != nil {
if err := db.EnableSuperReadonly(ctx); err != nil {
return errors.Wrap(err, "enable super read only")
}

return nil
}

func getTopology(peers sets.Set[string]) (string, []string, error) {
func getTopology(ctx context.Context, peers sets.Set[string]) (string, []string, error) {
replicas := sets.New[string]()
primary := ""

@@ -203,18 +204,18 @@ func getTopology(peers sets.Set[string]) (string, []string, error) {
}

for _, peer := range sets.List(peers) {
db, err := replicator.NewReplicator("operator", operatorPass, peer, mysql.DefaultAdminPort)
db, err := replicator.NewReplicator(ctx, "operator", operatorPass, peer, mysql.DefaultAdminPort)
if err != nil {
return "", nil, errors.Wrapf(err, "connect to %s", peer)
}
defer db.Close()

status, source, err := db.ReplicationStatus()
status, source, err := db.ReplicationStatus(ctx)
if err != nil {
return "", nil, errors.Wrap(err, "check replication status")
}

replicaHost, err := db.ReportHost()
replicaHost, err := db.ReportHost(ctx)
if err != nil {
return "", nil, errors.Wrap(err, "get report_host")
}
@@ -241,7 +242,7 @@ func getTopology(peers sets.Set[string]) (string, []string, error) {
return primary, sets.List(replicas), nil
}

func selectDonor(fqdn, primary string, replicas []string) (string, error) {
func selectDonor(ctx context.Context, fqdn, primary string, replicas []string) (string, error) {
donor := ""

operatorPass, err := getSecret(apiv1alpha1.UserOperator)
@@ -250,7 +251,7 @@ func selectDonor(fqdn, primary string, replicas []string) (string, error) {
}

for _, replica := range replicas {
db, err := replicator.NewReplicator("operator", operatorPass, replica, mysql.DefaultAdminPort)
db, err := replicator.NewReplicator(ctx, "operator", operatorPass, replica, mysql.DefaultAdminPort)
if err != nil {
continue
}