Skip to content

Commit

Permalink
Add support for multi-region recover subcommand with DNS (#2181)
Browse files Browse the repository at this point in the history
* Add support for multi-region recover subcommand with DNS

* Improve reliability of restart command
  • Loading branch information
johscheuer authored Dec 9, 2024
1 parent 43800d4 commit 17c77df
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 12 deletions.
5 changes: 2 additions & 3 deletions e2e/test_operator_plugin/operator_plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,7 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
})
})

// TODO(johscheuer): Enable once https://github.com/FoundationDB/fdb-kubernetes-operator/issues/2153 is fixed.
PWhen("all Pods in the primary and satellites are down with", func() {
When("all Pods in the primary and satellites are down", func() {
BeforeEach(func() {
runningVersion := fdbCluster.GetPrimary().GetCluster().GetRunningVersion()
parsedVersion, err := fdbv1beta2.ParseFdbVersion(runningVersion)
Expand All @@ -174,7 +173,7 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
}
})

When("DNS names in the cluster file are supported", func() {
When("DNS names in the cluster file are used", func() {
BeforeEach(func() {
var errGroup errgroup.Group
// Enable DNS names in the cluster file for the whole cluster.
Expand Down
74 changes: 65 additions & 9 deletions kubectl-fdb/cmd/recover_multi_region_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@ import (
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
"github.com/FoundationDB/fdb-kubernetes-operator/internal"
kubeHelper "github.com/FoundationDB/fdb-kubernetes-operator/internal/kubernetes"
"github.com/go-logr/logr"
"github.com/spf13/cobra"
corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/cli-runtime/pkg/genericclioptions"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -145,6 +147,7 @@ func recoverMultiRegionCluster(cmd *cobra.Command, opts recoverMultiRegionCluste

cmd.Println("current connection string", lastConnectionString)

usesDNSInClusterFile := cluster.UseDNSInClusterFile()
var useTLS bool
coordinators := map[string]fdbv1beta2.ProcessAddress{}
for _, addr := range addresses {
Expand All @@ -159,7 +162,7 @@ func recoverMultiRegionCluster(cmd *cobra.Command, opts recoverMultiRegionCluste
_, useTLS = parsed.Flags["tls"]
}

cmd.Println("current coordinators", coordinators, "useTLS", useTLS)
cmd.Println("Current coordinators", coordinators, "useTLS", useTLS)
// Fetch all Pods and coordinators for the remote and remote satellite.
runningCoordinators := map[string]fdbv1beta2.None{}
newCoordinators := make([]fdbv1beta2.ProcessAddress, 0, 5)
Expand All @@ -177,12 +180,26 @@ func recoverMultiRegionCluster(cmd *cobra.Command, opts recoverMultiRegionCluste
// Find a running coordinator to copy the coordinator files from.
var runningCoordinator *corev1.Pod
for _, pod := range pods.Items {
addr, parseErr := fdbv1beta2.ParseProcessAddress(pod.Status.PodIP)
if parseErr != nil {
return parseErr
var addr fdbv1beta2.ProcessAddress
if usesDNSInClusterFile {
dnsName := internal.GetPodDNSName(cluster, pod.GetName())
addr = fdbv1beta2.ProcessAddress{StringAddress: dnsName}
} else {
currentPod := pod
publicIPs := internal.GetPublicIPsForPod(&currentPod, logr.Discard())
if len(publicIPs) == 0 {
cmd.Println("Found no public IPs for pod:", pod.Name)
continue
}

var parseErr error
addr, parseErr = fdbv1beta2.ParseProcessAddress(publicIPs[0])
if parseErr != nil {
return parseErr
}
}

cmd.Println("checking pod:", pod.Name, "address:", addr, "pod IPs:", pod.Status.PodIP, "machineAddr:", addr.MachineAddress())
cmd.Println("Checking pod", pod.Name, "address", addr.MachineAddress())

loopPod := pod
if coordinatorAddr, ok := coordinators[addr.MachineAddress()]; ok {
Expand Down Expand Up @@ -233,7 +250,7 @@ func recoverMultiRegionCluster(cmd *cobra.Command, opts recoverMultiRegionCluste
if parseErr != nil {
return parseErr
}
cmd.Println("Adding pod as new coordinators:", candidate.Name)
cmd.Println("Adding pod as new coordinator:", candidate.Name)
if useTLS {
addr.Port = 4500
addr.Flags = map[string]bool{"tls": true}
Expand Down Expand Up @@ -343,7 +360,7 @@ func recoverMultiRegionCluster(cmd *cobra.Command, opts recoverMultiRegionCluste

cmd.Println("Killing fdbserver processes")
// Now all Pods must be restarted and the previous local cluster file must be deleted to make sure the fdbserver is picking the connection string from the seed cluster file (`/var/dynamic-conf/fdb.cluster`).
err = restartFdbserverInCluster(cmd.Context(), opts.client, opts.config, cluster)
err = restartFdbserverInCluster(cmd.Context(), cmd, opts.client, opts.config, cluster)
if err != nil {
return err
}
Expand Down Expand Up @@ -443,16 +460,55 @@ func uploadCoordinatorFile(cmd *cobra.Command, kubeClient client.Client, config
return kubeHelper.UploadFile(cmd.Context(), kubeClient, config, pod, fdbv1beta2.MainContainerName, tmpCoordinatorFile, dst)
}

func restartFdbserverInCluster(ctx context.Context, kubeClient client.Client, config *rest.Config, cluster *fdbv1beta2.FoundationDBCluster) error {
// restartFdbserverInCluster will try to restart all fdbserver processes inside all the pods of the cluster. If the restart fails, it will be retried again two more times.
func restartFdbserverInCluster(ctx context.Context, cmd *cobra.Command, kubeClient client.Client, config *rest.Config, cluster *fdbv1beta2.FoundationDBCluster) error {
pods, err := getRunningPodsForCluster(ctx, kubeClient, cluster)
if err != nil {
return err
}

// Now all Pods must be restarted and the previous local cluster file must be deleted to make sure the fdbserver is picking the connection string from the seed cluster file (`/var/dynamic-conf/fdb.cluster`).
retryRestart := make([]corev1.Pod, 0, len(pods.Items))
for _, pod := range pods.Items {
_, _, err := kubeHelper.ExecuteCommand(context.Background(), kubeClient, config, pod.Namespace, pod.Name, fdbv1beta2.MainContainerName, "pkill fdbserver && rm -f /var/fdb/data/fdb.cluster && pkill fdbserver || true", false)
var stderr string
_, _, err = kubeHelper.ExecuteCommand(context.Background(), kubeClient, config, pod.Namespace, pod.Name, fdbv1beta2.MainContainerName, "pkill fdbserver && rm -f /var/fdb/data/fdb.cluster && pkill fdbserver || true", false)
if err != nil {
// If the pod doesn't exist anymore ignore the error. The pod will have the new configuration when recreated again.
if k8serrors.IsNotFound(err) {
continue
}

time.Sleep(1 * time.Second)
cmd.Println("error restarting process in pod", pod.Name, "got error", err.Error(), "will be directly retried, stderr:", stderr)
_, stderr, err = kubeHelper.ExecuteCommand(context.Background(), kubeClient, config, pod.Namespace, pod.Name, fdbv1beta2.MainContainerName, "pkill fdbserver && rm -f /var/fdb/data/fdb.cluster && pkill fdbserver || true", false)
if err != nil {
cmd.Println("error restarting process in pod", pod.Name, "got error", err.Error(), "will be retried later, stderr:", stderr)
retryRestart = append(retryRestart, pod)
}
}
}

if len(retryRestart) == 0 {
return nil
}

// If we have more than one pod where we failed to restart the fdbserver processes, wait ten seconds before trying again.
time.Sleep(10 * time.Second)
cmd.Println("Failed to restart the fdbserver processes in", len(retryRestart), "pods, will be retried now.")

for _, pod := range retryRestart {
// Pod is marked for deletion, so we can skip it here.
if !pod.DeletionTimestamp.IsZero() {
continue
}

_, _, err = kubeHelper.ExecuteCommand(context.Background(), kubeClient, config, pod.Namespace, pod.Name, fdbv1beta2.MainContainerName, "pkill fdbserver && rm -f /var/fdb/data/fdb.cluster && pkill fdbserver || true", false)
if err != nil {
// If the pod doesn't exist anymore ignore the error. The pod will have the new configuration when recreated again.
if k8serrors.IsNotFound(err) {
continue
}

return err
}
}
Expand Down

0 comments on commit 17c77df

Please sign in to comment.