K8SPS-280: Improve full cluster crash recovery
Before these changes, we were rebooting the cluster from a complete outage
starting from pod-0, without checking which member had the latest
transactions. Therefore, our full cluster recovery implementation was prone
to data loss.

Now we're using mysql-shell's built-in checks to detect the member to
reboot from. For this, mysql-shell requires every member to be
reachable, so it can connect and check GTIDs on each one. That means in
case of a full cluster crash we need to start every pod and ensure it's
reachable.

We're bringing back the `/var/lib/mysql/full-cluster-crash` file to address
this requirement. Pods create this file if they detect they're in a full
cluster crash and restart themselves. After the restart, they start
the mysqld process but ensure the server comes up as read-only. After
all pods are up and running (ready), the operator runs
`dba.rebootClusterFromCompleteOutage()` in one of the MySQL pods. Which
pod we run this in is not important, since mysql-shell will connect
to each pod and select a suitable one to reboot from.
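
A minimal sketch of what this step could look like from the operator's side,
assuming the reboot is driven by shelling out to `mysqlsh` against any ready
pod; the function name, flags, and credential handling here are illustrative,
not the operator's actual implementation:

```go
package mysql

import (
	"context"
	"fmt"
	"os/exec"

	"github.com/pkg/errors"
)

// rebootFromCompleteOutage is a hypothetical helper: it asks mysql-shell to run
// dba.rebootClusterFromCompleteOutage() against one reachable member.
// mysql-shell then connects to every member, compares GTIDs, and bootstraps the
// group from the most up-to-date one, so it doesn't matter which pod we target.
func rebootFromCompleteOutage(ctx context.Context, user, pass, host, clusterName string) error {
	uri := fmt.Sprintf("%s:%s@%s", user, pass, host)
	script := fmt.Sprintf("dba.rebootClusterFromCompleteOutage('%s')", clusterName)

	cmd := exec.CommandContext(ctx, "mysqlsh", "--no-wizard", "--js", "--uri", uri, "-e", script)
	out, err := cmd.CombinedOutput()
	if err != nil {
		return errors.Wrapf(err, "reboot cluster from complete outage: %s", out)
	}
	return nil
}
```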

*Events*

This commit also introduces an event recorder and two events:

1. FullClusterCrashDetected
2. FullClusterCrashRecovered

Users will be able to see these events on the `PerconaServerMySQL` object.

For example:

```
$ kubectl describe ps cluster1
...
Events:
  Type     Reason                     Age                 From           Message
  ----     ------                     ----                ----           -------
  Warning  FullClusterCrashDetected   19m (x10 over 20m)  ps-controller  Full cluster crash detected
  Normal   FullClusterCrashRecovered  17m                 ps-controller  Cluster recovered from full cluster crash
```
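
A rough sketch of how the reconciler might emit these events with the standard
client-go event recorder; the helper and its parameters are assumptions for
illustration, only the event reasons and messages come from this commit:

```go
package mysql

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/tools/record"

	apiv1alpha1 "github.com/percona/percona-server-mysql-operator/api/v1alpha1"
)

// emitFullClusterCrashEvent is a hypothetical helper showing how the two
// events could be attached to the PerconaServerMySQL custom resource.
func emitFullClusterCrashEvent(rec record.EventRecorder, cr *apiv1alpha1.PerconaServerMySQL, recovered bool) {
	if !recovered {
		// Raised (possibly repeatedly) while pods are still restarting in read-only mode.
		rec.Event(cr, corev1.EventTypeWarning, "FullClusterCrashDetected", "Full cluster crash detected")
		return
	}
	// Raised once dba.rebootClusterFromCompleteOutage() has succeeded.
	rec.Event(cr, corev1.EventTypeNormal, "FullClusterCrashRecovered", "Cluster recovered from full cluster crash")
}
```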

*Probe timeouts*

Kubernetes had problems with timeouts in exec probes, which were fixed in
recent releases, but we still see problematic behavior. For example, even
though Kubernetes successfully detects a timeout in a probe, it doesn't
count the timeout as a failure, so the container is not restarted even if
its liveness probe times out a million times. With this commit we're
handling timeouts ourselves with contexts.
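
A minimal sketch of that idea, assuming the probe binary enforces its own
deadline and exits non-zero when it expires so the kubelet sees a plain
failure; `checkLiveness` is a placeholder, not the operator's real check:

```go
package main

import (
	"context"
	"log"
	"os"
	"time"
)

func main() {
	// Assumed to match the probe's timeoutSeconds in the pod spec.
	timeout := 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	if err := checkLiveness(ctx); err != nil {
		log.Printf("liveness check failed: %v", err)
		os.Exit(1) // a non-zero exit is always counted as a probe failure
	}
}

// checkLiveness stands in for the real health check (e.g. connecting to
// mysqld); it must honor ctx cancellation so the deadline takes effect.
func checkLiveness(ctx context.Context) error {
	select {
	case <-time.After(1 * time.Second): // pretend work
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}
```
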
egegunes committed Jul 24, 2023
1 parent fd3c06f commit a2734f0
Showing 17 changed files with 550 additions and 272 deletions.
2 changes: 1 addition & 1 deletion api/v1alpha1/perconaservermysql_types.go
@@ -480,7 +480,7 @@ func (cr *PerconaServerMySQL) CheckNSetDefaults(ctx context.Context, serverVersi
cr.Spec.MySQL.LivenessProbe.SuccessThreshold = 1
}
if cr.Spec.MySQL.LivenessProbe.TimeoutSeconds == 0 {
cr.Spec.MySQL.LivenessProbe.TimeoutSeconds = 30
cr.Spec.MySQL.LivenessProbe.TimeoutSeconds = 10
}

if cr.Spec.MySQL.ReadinessProbe.InitialDelaySeconds == 0 {
52 changes: 24 additions & 28 deletions build/ps-entrypoint.sh
@@ -182,6 +182,12 @@ load_group_replication_plugin() {
POD_IP=$(hostname -I | awk '{print $1}')

sed -i "/\[mysqld\]/a plugin_load_add=group_replication.so" $CFG
sed -i "/\[mysqld\]/a group_replication_exit_state_action=ABORT_SERVER" $CFG
}

ensure_read_only() {
sed -i "/\[mysqld\]/a read_only=ON" $CFG
sed -i "/\[mysqld\]/a super_read_only=ON" $CFG
}

MYSQL_VERSION=$(mysqld -V | awk '{print $3}' | awk -F'.' '{print $1"."$2}')
@@ -399,34 +405,24 @@ if [[ -f /var/lib/mysql/full-cluster-crash ]]; then
namespace=$(</var/run/secrets/kubernetes.io/serviceaccount/namespace)

echo "######FULL_CLUSTER_CRASH:${node_name}######"
echo "You have full cluster crash. You need to recover the cluster manually. Here are the steps:"
echo ""
echo "Latest GTID_EXECUTED in this node is ${gtid_executed}"
echo "Compare GTIDs in each MySQL pod and select the one with the newest GTID."
echo ""
echo "Create /var/lib/mysql/force-bootstrap inside the mysql container. For example, if you select ${cluster_name}-mysql-2 to recover from:"
echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-2 -c mysql -- touch /var/lib/mysql/force-bootstrap"
echo ""
echo "Remove /var/lib/mysql/full-cluster-crash in this pod to re-bootstrap the group. For example:"
echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-2 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
echo "This will restart the mysql container."
echo ""
echo "After group is bootstrapped and mysql container is ready, move on to the other pods:"
echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-1 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
echo "Wait until the pod ready"
echo ""
echo "$ kubectl -n ${namespace} exec ${cluster_name}-mysql-0 -c mysql -- rm /var/lib/mysql/full-cluster-crash"
echo "Wait until the pod ready"
echo ""
echo "Continue to other pods if you have more."
echo "#####LAST_LINE:${node_name}:${gtid_executed}"

for (( ; ; )); do
if [[ ! -f /var/lib/mysql/full-cluster-crash ]]; then
exit 0
fi
sleep 5
done
echo "You are in a full cluster crash. Operator will attempt to fix the issue automatically."
echo "MySQL pods will be up and running in read only mode."
echo "Latest GTID_EXECUTED on this node is ${gtid_executed}"
echo "######FULL_CLUSTER_CRASH:${node_name}######"

ensure_read_only
fi

recovery_file='/var/lib/mysql/sleep-forever'
if [ -f "${recovery_file}" ]; then
set +o xtrace
echo "The $recovery_file file is detected, node is going to infinity loop"
echo "If you want to exit from infinity loop you need to remove $recovery_file file"
for (( ; ; )); do
if [ ! -f "${recovery_file}" ]; then
exit 0
fi
done
fi

exec "$@"
41 changes: 21 additions & 20 deletions cmd/bootstrap/async_replication.go
@@ -1,6 +1,7 @@
package main

import (
"context"
"log"
"os"
"path/filepath"
@@ -14,7 +15,7 @@ import (
"github.com/percona/percona-server-mysql-operator/pkg/replicator"
)

func bootstrapAsyncReplication() error {
func bootstrapAsyncReplication(ctx context.Context) error {
timer := stopwatch.NewNamedStopwatch()
err := timer.AddMany([]string{"clone", "total"})
if err != nil {
@@ -45,7 +46,7 @@ func bootstrapAsyncReplication() error {
return errors.Wrap(err, "wait lock removal")
}
}
primary, replicas, err := getTopology(peers)
primary, replicas, err := getTopology(ctx, peers)
if err != nil {
return errors.Wrap(err, "select donor")
}
@@ -74,7 +75,7 @@ }
}
log.Printf("PrimaryIP: %s", primaryIp)

donor, err := selectDonor(fqdn, primary, replicas)
donor, err := selectDonor(ctx, fqdn, primary, replicas)
if err != nil {
return errors.Wrap(err, "select donor")
}
@@ -86,33 +87,33 @@ func bootstrapAsyncReplication() error {
return errors.Wrapf(err, "get %s password", apiv1alpha1.UserOperator)
}

db, err := replicator.NewReplicator("operator", operatorPass, podIp, mysql.DefaultAdminPort)
db, err := replicator.NewReplicator(ctx, "operator", operatorPass, podIp, mysql.DefaultAdminPort)
if err != nil {
return errors.Wrap(err, "connect to db")
}
defer db.Close()

if err := db.StopReplication(); err != nil {
if err := db.StopReplication(ctx); err != nil {
return err
}

switch {
case donor == "":
if err := db.ResetReplication(); err != nil {
if err := db.ResetReplication(ctx); err != nil {
return err
}

log.Printf("Can't find a donor, we're on our own.")
return nil
case donor == fqdn:
if err := db.ResetReplication(); err != nil {
if err := db.ResetReplication(ctx); err != nil {
return err
}

log.Printf("I'm the donor and therefore the primary.")
return nil
case primary == fqdn || primaryIp == podIp:
if err := db.ResetReplication(); err != nil {
if err := db.ResetReplication(ctx); err != nil {
return err
}

@@ -129,7 +130,7 @@ func bootstrapAsyncReplication() error {
log.Printf("Clone required: %t", requireClone)
if requireClone {
log.Println("Checking if a clone in progress")
inProgress, err := db.CloneInProgress()
inProgress, err := db.CloneInProgress(ctx)
if err != nil {
return errors.Wrap(err, "check if a clone in progress")
}
@@ -141,7 +142,7 @@ func bootstrapAsyncReplication() error {

timer.Start("clone")
log.Printf("Cloning from %s", donor)
err = db.Clone(donor, "operator", operatorPass, mysql.DefaultAdminPort)
err = db.Clone(ctx, donor, "operator", operatorPass, mysql.DefaultAdminPort)
timer.Stop("clone")
if err != nil && !errors.Is(err, replicator.ErrRestartAfterClone) {
return errors.Wrapf(err, "clone from donor %s", donor)
@@ -164,7 +165,7 @@ func bootstrapAsyncReplication() error {
}
}

rStatus, _, err := db.ReplicationStatus()
rStatus, _, err := db.ReplicationStatus(ctx)
if err != nil {
return errors.Wrap(err, "check replication status")
}
@@ -177,23 +178,23 @@ func bootstrapAsyncReplication() error {
return errors.Wrapf(err, "get %s password", apiv1alpha1.UserReplication)
}

if err := db.StopReplication(); err != nil {
if err := db.StopReplication(ctx); err != nil {
return errors.Wrap(err, "stop replication")
}

if err := db.StartReplication(primary, replicaPass, mysql.DefaultPort); err != nil {
if err := db.StartReplication(ctx, primary, replicaPass, mysql.DefaultPort); err != nil {
return errors.Wrap(err, "start replication")
}
}

if err := db.EnableSuperReadonly(); err != nil {
if err := db.EnableSuperReadonly(ctx); err != nil {
return errors.Wrap(err, "enable super read only")
}

return nil
}

func getTopology(peers sets.Set[string]) (string, []string, error) {
func getTopology(ctx context.Context, peers sets.Set[string]) (string, []string, error) {
replicas := sets.New[string]()
primary := ""

@@ -203,18 +204,18 @@ func getTopology(peers sets.Set[string]) (string, []string, error) {
}

for _, peer := range sets.List(peers) {
db, err := replicator.NewReplicator("operator", operatorPass, peer, mysql.DefaultAdminPort)
db, err := replicator.NewReplicator(ctx, "operator", operatorPass, peer, mysql.DefaultAdminPort)
if err != nil {
return "", nil, errors.Wrapf(err, "connect to %s", peer)
}
defer db.Close()

status, source, err := db.ReplicationStatus()
status, source, err := db.ReplicationStatus(ctx)
if err != nil {
return "", nil, errors.Wrap(err, "check replication status")
}

replicaHost, err := db.ReportHost()
replicaHost, err := db.ReportHost(ctx)
if err != nil {
return "", nil, errors.Wrap(err, "get report_host")
}
@@ -241,7 +242,7 @@ func getTopology(peers sets.Set[string]) (string, []string, error) {
return primary, sets.List(replicas), nil
}

func selectDonor(fqdn, primary string, replicas []string) (string, error) {
func selectDonor(ctx context.Context, fqdn, primary string, replicas []string) (string, error) {
donor := ""

operatorPass, err := getSecret(apiv1alpha1.UserOperator)
@@ -250,7 +251,7 @@ func selectDonor(fqdn, primary string, replicas []string) (string, error) {
}

for _, replica := range replicas {
db, err := replicator.NewReplicator("operator", operatorPass, replica, mysql.DefaultAdminPort)
db, err := replicator.NewReplicator(ctx, "operator", operatorPass, replica, mysql.DefaultAdminPort)
if err != nil {
continue
}