Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add e2e test case for maintenance mode interaction #1915

Merged
merged 1 commit into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions e2e/fixtures/fdb_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -748,8 +748,8 @@ func (fdbCluster *FdbCluster) SetPodAsUnschedulable(pod corev1.Pod) error {

// SetProcessGroupsAsUnschedulable replaces the NoSchedule list of the current FoundationDBCluster with the provided process groups.
// This makes sure that the Pods of those process groups are stuck in Pending.
func (fdbCluster *FdbCluster) SetProcessGroupsAsUnschedulable(procesGroups []fdbv1beta2.ProcessGroupID) {
fdbCluster.cluster.Spec.Buggify.NoSchedule = procesGroups
func (fdbCluster *FdbCluster) SetProcessGroupsAsUnschedulable(processGroups []fdbv1beta2.ProcessGroupID) {
fdbCluster.cluster.Spec.Buggify.NoSchedule = processGroups
fdbCluster.UpdateClusterSpec()
}

Expand Down
163 changes: 163 additions & 0 deletions e2e/test_operator_maintenance_mode/operator_maintenance_mode_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
/*
* operator_maintenance_mode_test.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package operatorha

/*
This test suite includes tests around the interaction of the maintenance mode and the operator.
*/

import (
"fmt"
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
corev1 "k8s.io/api/core/v1"
"log"
"time"

"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)

var (
factory *fixtures.Factory
fdbCluster *fixtures.FdbCluster
testOptions *fixtures.FactoryOptions
)

// init stores the factory options returned by fixtures.InitFlags in testOptions,
// so that BeforeSuite can create the test factory from them.
func init() {
testOptions = fixtures.InitFlags()
}

// Suite setup: build the factory from the parsed test options, create one
// FoundationDB cluster that is shared by all specs and make sure it holds data.
var _ = BeforeSuite(func() {
	factory = fixtures.CreateFactory(testOptions)

	clusterConfig := fixtures.DefaultClusterConfig(false)
	clusterOptions := factory.GetClusterOptions()
	fdbCluster = factory.CreateFdbCluster(clusterConfig, clusterOptions...)

	// The specs inspect the team trackers, so the cluster needs some data.
	factory.CreateDataLoaderIfAbsent(fdbCluster)
})

// Suite teardown: log the failure message if the suite failed and shut the
// factory down afterwards.
var _ = AfterSuite(func() {
	report := CurrentSpecReport()
	if report.Failed() {
		log.Printf("failed due to %s", report.FailureMessage())
	}

	factory.Shutdown()
})

var _ = Describe("Operator maintenance mode tests", Label("e2e"), func() {
AfterEach(func() {
if CurrentSpecReport().Failed() {
factory.DumpState(fdbCluster)
}
Expect(fdbCluster.WaitForReconciliation()).ToNot(HaveOccurred())
factory.StopInvariantCheck()
// Make sure all data is present in the cluster
fdbCluster.EnsureTeamTrackersAreHealthy()
fdbCluster.EnsureTeamTrackersHaveMinReplicas()
})

When("the maintenance mode is set", func() {
var failingStoragePod corev1.Pod
var faultDomain fdbv1beta2.FaultDomain

BeforeEach(func() {
failingStoragePod = fixtures.RandomPickOnePod(fdbCluster.GetStoragePods().Items)

// Set maintenance mode for this Pod
for _, processGroup := range fdbCluster.GetCluster().Status.ProcessGroups {
if processGroup.ProcessClass != fdbv1beta2.ProcessClassStorage {
continue
}

if processGroup.ProcessGroupID == fixtures.GetProcessGroupID(failingStoragePod) {
faultDomain = processGroup.FaultDomain
}
}

// Set the maintenance mode for 4 minutes.
fdbCluster.RunFdbCliCommandInOperator(fmt.Sprintf("maintenance on %s 240", faultDomain), false, 60)

// Set this Pod as unschedulable to keep it pending.
Expect(fdbCluster.SetPodAsUnschedulable(failingStoragePod)).NotTo(HaveOccurred())
})

AfterEach(func() {
// Make sure that the quota is deleted and new PVCs can be created.
Expect(fdbCluster.ClearBuggifyNoSchedule(true)).NotTo(HaveOccurred())
// Reset the maintenance mode
fdbCluster.RunFdbCliCommandInOperator("maintenance off", false, 60)
})

When("the Pod comes back before the maintenance mode times out", func() {
It("should not set the team tracker status to unhealthy", func() {
// Make sure the team tracker status shows healthy for the failed Pod and the maintenance zone is set.
Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
status := fdbCluster.GetStatus()

for _, tracker := range status.Cluster.Data.TeamTrackers {
log.Println(tracker.State.Name, ":", tracker.State.Healthy)
g.Expect(tracker.State.Healthy).To(BeTrue())
}

log.Println("Maintenance Zone:", status.Cluster.MaintenanceZone)
return status.Cluster.MaintenanceZone
}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))
})
})

When("the maintenance mode times out", func() {
It("should update the team tracker status to unhealthy", func() {
// Make sure the team tracker status shows healthy for the failed Pod and the maintenance zone is set.
Consistently(func(g Gomega) fdbv1beta2.FaultDomain {
status := fdbCluster.GetStatus()

for _, tracker := range status.Cluster.Data.TeamTrackers {
g.Expect(tracker.State.Healthy).To(BeTrue())
}

return status.Cluster.MaintenanceZone
}).WithTimeout(2 * time.Minute).WithPolling(2 * time.Second).Should(Equal(faultDomain))

log.Println("Wait until maintenance mode times out")
// Wait until the maintenance zone is reset
Eventually(func(g Gomega) fdbv1beta2.FaultDomain {
return fdbCluster.GetStatus().Cluster.MaintenanceZone
}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(fdbv1beta2.FaultDomain("")))

startTime := time.Now()
log.Println("Wait until failure is detected")
// We would expect that the team tracker gets unhealthy once the maintenance mode is timed out.
Eventually(func(g Gomega) fdbv1beta2.FaultDomain {
status := fdbCluster.GetStatus()
for _, tracker := range status.Cluster.Data.TeamTrackers {
g.Expect(tracker.State.Healthy).To(BeFalse())
}

return status.Cluster.MaintenanceZone
}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(Equal(fdbv1beta2.FaultDomain("")))
johscheuer marked this conversation as resolved.
Show resolved Hide resolved

log.Println("It took:", time.Since(startTime).String(), "to detected the failure")
})
})
})
})
34 changes: 34 additions & 0 deletions e2e/test_operator_maintenance_mode/suite_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* suite_test.go
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package operatorha

import (
"testing"
"time"

"github.com/FoundationDB/fdb-kubernetes-operator/e2e/fixtures"
"github.com/onsi/gomega"
)

// TestOperatorHA is the `go test` entry point of this package. It sets the
// default Gomega Eventually timeout and then hands over to the Ginkgo runner
// for the maintenance mode suite.
func TestOperatorHA(t *testing.T) {
	const defaultEventuallyTimeout = 10 * time.Second

	gomega.SetDefaultEventuallyTimeout(defaultEventuallyTimeout)
	fixtures.RunGinkgoTests(t, "Operator maintenance mode test suite")
}
Loading