Skip to content

Commit

Permalink
Add e2e test case for maintenance mode interaction
Browse files Browse the repository at this point in the history
  • Loading branch information
johscheuer committed Jan 11, 2024
1 parent 6d2439f commit ebfba0a
Show file tree
Hide file tree
Showing 3 changed files with 199 additions and 2 deletions.
4 changes: 2 additions & 2 deletions e2e/fixtures/fdb_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -748,8 +748,8 @@ func (fdbCluster *FdbCluster) SetPodAsUnschedulable(pod corev1.Pod) error {

// SetProcessGroupsAsUnschedulable sets the provided process groups on the NoSchedule list of the current FoundationDBCluster. This will make
// sure that the Pod is stuck in Pending.
// The provided slice is assigned to Spec.Buggify.NoSchedule, replacing whatever the list contained before (it is not appended),
// and UpdateClusterSpec is called afterwards.
func (fdbCluster *FdbCluster) SetProcessGroupsAsUnschedulable(procesGroups []fdbv1beta2.ProcessGroupID) {
fdbCluster.cluster.Spec.Buggify.NoSchedule = procesGroups
func (fdbCluster *FdbCluster) SetProcessGroupsAsUnschedulable(processGroups []fdbv1beta2.ProcessGroupID) {
fdbCluster.cluster.Spec.Buggify.NoSchedule = processGroups
fdbCluster.UpdateClusterSpec()
}

Expand Down
163 changes: 163 additions & 0 deletions e2e/test_operator_maintenance_mode/operator_maintenance_mode_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
/*
* operator_maintenance_mode_test.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package operatorha

/*
This test suite includes tests around the interaction of the maintenance mode and the operator.
*/

import (
"fmt"
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
corev1 "k8s.io/api/core/v1"
"log"
"time"

"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

// Package-level state shared by every spec in this suite.
var (
// factory is created in BeforeSuite from testOptions and shut down in AfterSuite.
factory *fixtures.Factory
// fdbCluster is the cluster created in BeforeSuite; all specs run against it.
fdbCluster *fixtures.FdbCluster
// testOptions holds the result of fixtures.InitFlags, assigned in init.
testOptions *fixtures.FactoryOptions
)

// init stores the result of fixtures.InitFlags in testOptions so that the
// options are available when BeforeSuite creates the factory.
func init() {
	testOptions = fixtures.InitFlags()
}

// Suite setup: create the factory, create the cluster used by all specs and
// make sure a data loader exists for it.
var _ = BeforeSuite(func() {
	factory = fixtures.CreateFactory(testOptions)

	clusterConfig := fixtures.DefaultClusterConfig(false)
	clusterOptions := factory.GetClusterOptions()
	fdbCluster = factory.CreateFdbCluster(clusterConfig, clusterOptions...)

	// Load some data into the cluster.
	factory.CreateDataLoaderIfAbsent(fdbCluster)
})

// Suite teardown: log the failure message when the current spec report is
// marked as failed, then shut the factory down.
var _ = AfterSuite(func() {
	if report := CurrentSpecReport(); report.Failed() {
		log.Printf("failed due to %s", report.FailureMessage())
	}

	factory.Shutdown()
})

// Specs covering the interaction between the FoundationDB maintenance mode and
// the operator: a storage Pod is kept Pending while its fault domain is under
// maintenance, and the team tracker health reported in the cluster status is
// observed while the maintenance mode is active and after it has timed out.
var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() {
	AfterEach(func() {
		if CurrentSpecReport().Failed() {
			factory.DumpState(fdbCluster)
		}
		Expect(fdbCluster.WaitForReconciliation()).ToNot(HaveOccurred())
		factory.StopInvariantCheck()
		// Make sure all data is present in the cluster
		fdbCluster.EnsureTeamTrackersAreHealthy()
		fdbCluster.EnsureTeamTrackersHaveMinReplicas()
	})

	When("the maintenance mode is set", func() {
		var failingStoragePod corev1.Pod
		var faultDomain fdbv1beta2.FaultDomain

		BeforeEach(func() {
			// faultDomain is shared by all specs in this container; reset it so a failed
			// lookup cannot silently reuse the value picked by a previous spec.
			faultDomain = fdbv1beta2.FaultDomain("")
			failingStoragePod = fixtures.RandomPickOnePod(fdbCluster.GetStoragePods().Items)
			targetProcessGroupID := fixtures.GetProcessGroupID(failingStoragePod)

			// Look up the fault domain of the selected storage Pod in the cluster status.
			for _, processGroup := range fdbCluster.GetCluster().Status.ProcessGroups {
				if processGroup.ProcessClass != fdbv1beta2.ProcessClassStorage {
					continue
				}

				if processGroup.ProcessGroupID == targetProcessGroupID {
					faultDomain = processGroup.FaultDomain
					break
				}
			}

			// Without a fault domain the fdbcli command below would be issued with an empty
			// zone and the specs would fail later with a much less helpful message.
			Expect(faultDomain).NotTo(
				Equal(fdbv1beta2.FaultDomain("")),
				"could not find a fault domain for process group %s",
				targetProcessGroupID,
			)

			// Set the maintenance mode for 4 minutes.
			fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 240", faultDomain), false, 60)

			// Set this Pod as unschedulable to keep it pending.
			Expect(fdbCluster.SetPodAsUnschedulable(failingStoragePod)).NotTo(HaveOccurred())
		})

		AfterEach(func() {
			// Clear the Buggify no-schedule list again.
			Expect(fdbCluster.ClearBuggifyNoSchedule(true)).NotTo(HaveOccurred())
			// Reset the maintenance mode
			fdbCluster.RunFdbCliCommandInOperator("maintenance off", false, 60)
		})

		When("the Pod comes back before the maintenance mode times out", func() {
			It("should not set the team tracker status to unhealthy", func() {
				// Make sure the team tracker status shows healthy for the failed Pod and the maintenance zone is set.
				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
					status := fdbCluster.GetStatus()

					for _, tracker := range status.Cluster.Data.TeamTrackers {
						log.Println(tracker.State.Name, ":", tracker.State.Healthy)
						g.Expect(tracker.State.Healthy).To(BeTrue())
					}

					log.Println("Maintenance Zone:", status.Cluster.MaintenanceZone)
					return status.Cluster.MaintenanceZone
				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
			})
		})

		When("the maintenance mode times out", func() {
			It("should update the team tracker status to unhealthy", func() {
				// While the maintenance mode is active every team tracker must stay healthy and
				// the maintenance zone must match the fault domain of the failed Pod.
				Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
					status := fdbCluster.GetStatus()

					for _, tracker := range status.Cluster.Data.TeamTrackers {
						g.Expect(tracker.State.Healthy).To(BeTrue())
					}

					return status.Cluster.MaintenanceZone
				}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))

				log.Println("Wait until maintenance mode times out")
				// Wait until the maintenance zone is reset
				Eventually(func() fdbv1beta2.FaultDomain {
					return fdbCluster.GetStatus().Cluster.MaintenanceZone
				}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(fdbv1beta2.FaultDomain("")))

				startTime := time.Now()
				log.Println("Wait until failure is detected")
				// We would expect that the team tracker gets unhealthy once the maintenance mode is timed out.
				Eventually(func(g Gomega) fdbv1beta2.FaultDomain {
					status := fdbCluster.GetStatus()
					for _, tracker := range status.Cluster.Data.TeamTrackers {
						g.Expect(tracker.State.Healthy).To(BeFalse())
					}

					return status.Cluster.MaintenanceZone
				}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(fdbv1beta2.FaultDomain("")))

				log.Println("It took:", time.Since(startTime).String(), "to detected the failure")
			})
		})
	})
})
34 changes: 34 additions & 0 deletions e2e/test_operator_maintenance_mode/suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* suite_test.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package operatorha

import (
"testing"
"time"

"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
"github.com/onsi/gomega"
)

// TestOperatorHA is the go test entry point for this suite. It sets Gomega's
// default Eventually timeout to 10 seconds and then hands control to
// fixtures.RunGinkgoTests with the suite description.
func TestOperatorHA(t *testing.T) {
	const defaultEventuallyTimeout = 10 * time.Second

	gomega.SetDefaultEventuallyTimeout(defaultEventuallyTimeout)
	fixtures.RunGinkgoTests(t, "Operator maintenance mode test suite")
}

0 comments on commit ebfba0a

Please sign in to comment.