diff --git a/e2e/fixtures/chaos_stress.go b/e2e/fixtures/chaos_stress.go
new file mode 100644
index 00000000..8d182abc
--- /dev/null
+++ b/e2e/fixtures/chaos_stress.go
@@ -0,0 +1,52 @@
+/*
+ * chaos_stress.go
+ *
+ * This source file is part of the FoundationDB open source project
+ *
+ * Copyright 2021-2040 Apple Inc. and the FoundationDB project authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package fixtures
+
+import (
+	chaosmesh "github.com/chaos-mesh/chaos-mesh/api/v1alpha1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/pointer"
+)
+
+// InjectPodStress creates a StressChaos experiment that applies the provided memory and/or CPU stressors to the named containers of the Pods selected by target, and returns the created experiment.
+func (factory *Factory) InjectPodStress(target chaosmesh.PodSelectorSpec, containerNames []string, memoryStressor *chaosmesh.MemoryStressor, cpuStressor *chaosmesh.CPUStressor) *ChaosMeshExperiment {
+	return factory.CreateExperiment(&chaosmesh.StressChaos{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      factory.RandStringRunes(32),
+			Namespace: factory.GetChaosNamespace(),
+			Labels:    factory.GetDefaultLabels(),
+		},
+		Spec: chaosmesh.StressChaosSpec{
+			Duration: pointer.String(ChaosDurationForever),
+			Stressors: &chaosmesh.Stressors{
+				MemoryStressor: memoryStressor, // may be nil when only CPU stress is wanted
+				CPUStressor:    cpuStressor, // may be nil when only memory stress is wanted
+			},
+			ContainerSelector: chaosmesh.ContainerSelector{
+				PodSelector: chaosmesh.PodSelector{
+					Selector: target,
+					Mode:     chaosmesh.AllMode, // target every Pod matched by the selector
+				},
+				ContainerNames: containerNames,
+			},
+		},
+	})
+}
diff --git a/e2e/fixtures/factory.go b/e2e/fixtures/factory.go
index 33fab939..f0072191 100644
--- a/e2e/fixtures/factory.go
+++ b/e2e/fixtures/factory.go
@@ -286,7 +286,7 @@ func (factory *Factory) GetSidecarContainerOverrides(debugSymbols bool) fdbv1bet
 
 func (factory *Factory) getClusterName() string {
 	if factory.options.clusterName == "" {
-		return fmt.Sprintf("fdb-cluster-%s", factory.RandStringRunes(8))
+		return fmt.Sprintf("%s-%s", testSuiteName, factory.RandStringRunes(8))
 	}
 
 	return factory.options.clusterName
diff --git a/e2e/fixtures/fdb_data_loader.go b/e2e/fixtures/fdb_data_loader.go
index 3b5f6fff..b242cd3f 100644
--- a/e2e/fixtures/fdb_data_loader.go
+++ b/e2e/fixtures/fdb_data_loader.go
@@ -25,6 +25,7 @@ import (
 	"context"
 	"errors"
 	"io"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	"log"
 	"text/template"
 	"time"
@@ -312,6 +313,12 @@ func (factory *Factory) getDataLoaderConfig(cluster *FdbCluster) *dataLoaderConf
 
 // CreateDataLoaderIfAbsent will create the data loader for the provided cluster and load some random data into the cluster.
func (factory *Factory) CreateDataLoaderIfAbsent(cluster *FdbCluster) {
+	factory.CreateDataLoaderIfAbsentWithWait(cluster, true)
+}
+
+// CreateDataLoaderIfAbsentWithWait will create the data loader for the provided cluster and load some random data into the cluster.
+// If wait is true, the method will wait until the data loader has finished and delete it afterwards, otherwise it returns without waiting.
+func (factory *Factory) CreateDataLoaderIfAbsentWithWait(cluster *FdbCluster, wait bool) {
 	if !factory.options.enableDataLoading {
 		return
 	}
@@ -347,15 +354,27 @@ func (factory *Factory) CreateDataLoaderIfAbsent(cluster *FdbCluster) {
 		).NotTo(gomega.HaveOccurred())
 	}
 
+	if !wait {
+		return
+	}
+
 	factory.WaitUntilDataLoaderIsDone(cluster)
+	factory.DeleteDataLoader(cluster)
+}
 
+// DeleteDataLoader will delete the data loader job and the data loader Pods, a job that no longer exists is not treated as an error.
+func (factory *Factory) DeleteDataLoader(cluster *FdbCluster) {
 	// Remove data loader Pods again, as the loading was done.
-	gomega.Expect(factory.controllerRuntimeClient.Delete(context.Background(), &batchv1.Job{
+	err := factory.controllerRuntimeClient.Delete(context.Background(), &batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      dataLoaderName,
 			Namespace: cluster.Namespace(),
 		},
-	})).NotTo(gomega.HaveOccurred())
+	})
+
+	if err != nil && !k8serrors.IsNotFound(err) {
+		gomega.Expect(err).NotTo(gomega.HaveOccurred())
+	}
 
 	gomega.Expect(factory.controllerRuntimeClient.DeleteAllOf(context.Background(), &corev1.Pod{},
 		client.InNamespace(cluster.Namespace()),
diff --git a/e2e/fixtures/fdb_operator_fixtures.go b/e2e/fixtures/fdb_operator_fixtures.go
index d6f723e7..96217776 100644
--- a/e2e/fixtures/fdb_operator_fixtures.go
+++ b/e2e/fixtures/fdb_operator_fixtures.go
@@ -140,9 +140,6 @@ func (factory *Factory) ensureHaMemberClusterExists(
 		fetchedCluster.cluster.Spec.SeedConnectionString = seedConnection
 		log.Printf("update cluster: %s/%s", curCluster.cluster.Namespace, curCluster.cluster.Name)
 		fetchedCluster.UpdateClusterSpec()
-		if err != nil {
-			return err
-		}
 	}
 
 	return
haFdbCluster.addCluster(fetchedCluster)
diff --git a/e2e/fixtures/ha_fdb_cluster.go b/e2e/fixtures/ha_fdb_cluster.go
index 730016a5..8bec5937 100644
--- a/e2e/fixtures/ha_fdb_cluster.go
+++ b/e2e/fixtures/ha_fdb_cluster.go
@@ -38,11 +38,11 @@ const (
 	// RemoteID is the suffix for the remote FoundationDBCluster
 	RemoteID = "remote"
 	// PrimarySatelliteID is the suffix for the primary satellite FoundationDBCluster
-	PrimarySatelliteID = "primary-satellite"
+	PrimarySatelliteID = "primary-sat"
 	// RemoteSatelliteID is the suffix for the remote satellite FoundationDBCluster
-	RemoteSatelliteID = "remote-satellite"
+	RemoteSatelliteID = "remote-sat"
 	// SatelliteID is the suffix for the satellite FoundationDBCluster
-	SatelliteID = "satellite"
+	SatelliteID = "sat"
 )
 
 // HaFdbCluster is a struct around handling HA FoundationDBClusters.
diff --git a/e2e/fixtures/kubernetes_fixtures.go b/e2e/fixtures/kubernetes_fixtures.go
index 782c1f34..fded7166 100644
--- a/e2e/fixtures/kubernetes_fixtures.go
+++ b/e2e/fixtures/kubernetes_fixtures.go
@@ -47,7 +47,9 @@ const (
-// method will return the namespace name as the username a hyphen and 8 random chars.
+// method will return the namespace name as the username, a hyphen, the test suite name, a hyphen and 8 random chars.
 func (factory *Factory) getRandomizedNamespaceName() string {
 	gomega.Expect(factory.userName).To(gomega.MatchRegexp(namespaceRegEx), "user name contains invalid characters")
-	return factory.userName + "-" + factory.RandStringRunes(8)
+	name := factory.userName + "-" + testSuiteName + "-" + factory.RandStringRunes(8)
+	log.Println("namespace:", name, "length:", len(name))
+	return name
 }
 
 // MultipleNamespaces creates multiple namespaces for HA testing.
@@ -95,6 +97,8 @@ func (factory *Factory) createNamespace(suffix string) string {
 		namespace = namespace + "-" + suffix
 	}
 
+	g.Expect(len(namespace)).To(gomega.BeNumerically("<=", 63), "namespace name must not be longer than 63 characters")
+
 	err := factory.checkIfNamespaceIsTerminating(namespace)
 	g.Expect(err).NotTo(gomega.HaveOccurred())
 
diff --git a/e2e/test_operator/operator_test.go b/e2e/test_operator/operator_test.go
index feafb54b..8cbb360b 100644
--- a/e2e/test_operator/operator_test.go
+++ b/e2e/test_operator/operator_test.go
@@ -373,6 +373,100 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
 			// fields that we expect are actually set.
 		})
 
+	PWhen("replacing log Pod with high queue", func() {
+		var experiment *fixtures.ChaosMeshExperiment
+
+		BeforeEach(func() {
+			spec := fdbCluster.GetCluster().Spec.DeepCopy()
+			spec.AutomationOptions.UseLocalitiesForExclusion = pointer.Bool(true)
+			fdbCluster.UpdateClusterSpecWithSpec(spec)
+			Expect(fdbCluster.GetCluster().UseLocalitiesForExclusion()).To(BeTrue())
+
+			// Until the race condition is resolved in the FDB go bindings make sure the operator is not restarted.
+			// See: https://github.com/apple/foundationdb/issues/11222
+			// We can remove this once 7.1 is the default version.
+			factory.DeleteChaosMeshExperimentSafe(scheduleInjectPodKill)
+			status := fdbCluster.GetStatus()
+
+			// Pick the first process in the cluster status that reports the log role.
+			var processGroupID fdbv1beta2.ProcessGroupID
+			for _, process := range status.Cluster.Processes {
+				var isLog bool
+				for _, role := range process.Roles {
+					if role.Role == "log" {
+						isLog = true
+						break
+					}
+				}
+
+				if !isLog {
+					continue
+				}
+
+				processGroupID = fdbv1beta2.ProcessGroupID(process.Locality[fdbv1beta2.FDBLocalityInstanceIDKey])
+				break
+			}
+
+			Expect(processGroupID).NotTo(BeEmpty(), "could not find a log process in the cluster status")
+
+			// Find the log Pod that belongs to the selected process group.
+			var replacedPod corev1.Pod
+			for _, pod := range fdbCluster.GetLogPods().Items {
+				if fixtures.GetProcessGroupID(pod) != processGroupID {
+					continue
+				}
+
+				replacedPod = pod
+				break
+			}
+
+			Expect(replacedPod.Name).NotTo(BeEmpty(), "could not find a log Pod for process group %s", processGroupID)
+
+			log.Println("Inject chaos")
+			experiment = factory.InjectPodStress(fixtures.PodSelector(&replacedPod), []string{fdbv1beta2.MainContainerName}, nil, &chaosmesh.CPUStressor{
+				Stressor: chaosmesh.Stressor{
+					Workers: 1,
+				},
+				Load: pointer.Int(80),
+			})
+
+			factory.CreateDataLoaderIfAbsent(fdbCluster)
+
+			// NOTE(review): presumably this gives the CPU stress time to build up the log queue before the Pod is replaced, confirm.
+			time.Sleep(1 * time.Minute)
+			log.Println("replacedPod", replacedPod.Name, "useLocalitiesForExclusion", fdbCluster.GetCluster().UseLocalitiesForExclusion())
+			fdbCluster.ReplacePod(replacedPod, true)
+		})
+
+		It("should exclude the server", func() {
+			// Wait until the database configuration lists no excluded servers anymore.
+			Eventually(func() []fdbv1beta2.ExcludedServers {
+				status := fdbCluster.GetStatus()
+				excludedServers := status.Cluster.DatabaseConfiguration.ExcludedServers
+				log.Println("excludedServers", excludedServers)
+				return excludedServers
+			}).WithTimeout(15 * time.Minute).WithPolling(1 * time.Second).Should(BeEmpty())
+		})
+
+		AfterEach(func() {
+			Expect(fdbCluster.ClearProcessGroupsToRemove()).NotTo(HaveOccurred())
+			factory.DeleteChaosMeshExperimentSafe(experiment)
+			// Making sure we included back all the process groups after exclusion is complete.
+			Expect(fdbCluster.GetStatus().Cluster.DatabaseConfiguration.ExcludedServers).To(BeEmpty())
+
+			if factory.ChaosTestsEnabled() {
+				scheduleInjectPodKill = factory.ScheduleInjectPodKillWithName(
+					fixtures.GetOperatorSelector(fdbCluster.Namespace()),
+					"*/2 * * * *",
+					chaosmesh.OneMode,
+					fdbCluster.Namespace()+"-"+fdbCluster.Name(),
+				)
+			}
+
+			factory.DeleteDataLoader(fdbCluster)
+		})
+	})
+
 	When("replacing a coordinator Pod", func() {
 		var replacedPod corev1.Pod
 		var useLocalitiesForExclusion bool