WIP Test e2e #223

Closed · wants to merge 15 commits
105 changes: 53 additions & 52 deletions e2e/self_node_remediation_test.go
@@ -30,7 +30,7 @@ import (
const (
disconnectCommand = "ip route add blackhole %s"
reconnectCommand = "ip route delete blackhole %s"
nodeExecTimeout = 20 * time.Second
nodeExecTimeout = 120 * time.Second
reconnectInterval = 300 * time.Second
skipLogsEnvVarName = "SKIP_LOG_VERIFICATION"
skipOOSREnvVarName = "SKIP_OOST_REMEDIATION_VERIFICATION"
@@ -75,11 +75,12 @@ var _ = Describe("Self Node Remediation E2E", func() {

AfterEach(func() {
// restart snr pods for resetting logs...
restartSnrPods(workers)
//restartSnrPods(workers)
})

JustAfterEach(func() {
printSNRLogsFromNode(&workers.Items[1])
// TODO why worker 1??
Contributor: No idea, it must be a mistake

Contributor: Ah no, it comes from a refactoring; originally it was the log of healthyNode := &workers.Items[1]

Contributor: I could've kept at least the variable name to make it clearer

//printSNRLogsFromNode(&workers.Items[1])
})

Describe("With API connectivity", func() {
@@ -146,15 +147,16 @@ var _ = Describe("Self Node Remediation E2E", func() {

Describe("Without API connectivity", func() {
Context("Healthy node (no SNR)", func() {

// no api connectivity
// a) healthy
// - kill connectivity on one node
// - wait until connection restored
// - verify node did not reboot and wasn't deleted
// - verify peer check did happen

var testStartTime *metav1.Time
BeforeEach(func() {
testStartTime = &metav1.Time{Time: time.Now()}
killApiConnection(node, apiIPs, true)
})

@@ -168,10 +170,10 @@
checkNoNodeRecreate(node, oldUID)
checkNoReboot(node, oldBootTime)

if _, isExist := os.LookupEnv(skipLogsEnvVarName); !isExist {
// check logs to make sure that the actual peer health check did run
checkSnrLogs(node, []string{"failed to check api server", "Peer told me I'm healthy."})
}
//if _, isExist := os.LookupEnv(skipLogsEnvVarName); !isExist {
// check logs to make sure that the actual peer health check did run
checkSnrLogs(node, []string{"failed to check api server", "Peer told me I'm healthy."}, testStartTime)
//}
})
})

@@ -185,8 +187,10 @@

var snr *v1alpha1.SelfNodeRemediation
var oldPodCreationTime time.Time
var testStartTime *metav1.Time

BeforeEach(func() {
testStartTime = &metav1.Time{Time: time.Now()}
killApiConnection(node, apiIPs, false)
snr = createSNR(node, v1alpha1.ResourceDeletionRemediationStrategy)
oldPodCreationTime = findSnrPod(node).CreationTimestamp.Time
@@ -204,11 +208,11 @@
// - because the 2nd check has a small timeout only
checkReboot(node, oldBootTime)
checkPodRecreated(node, oldPodCreationTime)
if _, isExist := os.LookupEnv(skipLogsEnvVarName); !isExist {
// we can't check logs of unhealthy node anymore, check peer logs
peer := &workers.Items[1]
checkSnrLogs(peer, []string{node.GetName(), "node is unhealthy"})
}
//if _, isExist := os.LookupEnv(skipLogsEnvVarName); !isExist {
// we can't check logs of unhealthy node anymore, check peer logs
peer := &workers.Items[1]
checkSnrLogs(peer, []string{node.GetName(), "IsHealthy OWNED by NHC unhealthy"}, testStartTime)
//}
})

})
@@ -222,9 +226,11 @@

uids := make(map[string]types.UID)
bootTimes := make(map[string]*time.Time)
var testStartTime *metav1.Time

BeforeEach(func() {
wg := sync.WaitGroup{}
testStartTime = &metav1.Time{Time: time.Now()}
for i := range workers.Items {
wg.Add(1)
worker := &workers.Items[i]
@@ -233,7 +239,7 @@
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(worker), worker)).ToNot(HaveOccurred())
uids[worker.GetName()] = worker.GetUID()

// and the lat boot time
// and the last boot time
t, err := getBootTime(worker)
Expect(err).ToNot(HaveOccurred())
bootTimes[worker.GetName()] = t
@@ -269,10 +275,10 @@
checkNoNodeRecreate(worker, uids[worker.GetName()])
checkNoReboot(worker, bootTimes[worker.GetName()])

if _, isExist := os.LookupEnv(skipLogsEnvVarName); !isExist {
// check logs to make sure that the actual peer health check did run
checkSnrLogs(worker, []string{"failed to check api server", "nodes couldn't access the api-server"})
}
//if _, isExist := os.LookupEnv(skipLogsEnvVarName); !isExist {
// check logs to make sure that the actual peer health check did run
checkSnrLogs(worker, []string{"failed to check api server", "nodes couldn't access the api-server"}, testStartTime)
//}
}()
}
wg.Wait()
@@ -311,11 +317,11 @@

AfterEach(func() {
// restart snr pods for resetting logs...
restartSnrPods(controlPlaneNodes)
//restartSnrPods(controlPlaneNodes)
})

JustAfterEach(func() {
printSNRLogsFromNode(&controlPlaneNodes.Items[1])
//printSNRLogsFromNode(&controlPlaneNodes.Items[1])
})

Describe("With API connectivity", func() {
@@ -385,22 +391,18 @@ func createSNR(node *v1.Node, remediationStrategy v1alpha1.RemediationStrategyTy
}

func getBootTime(node *v1.Node) (*time.Time, error) {
bootTimeCommand := []string{"uptime", "-s"}
var bootTime time.Time
Eventually(func() error {
var bootTime *time.Time
EventuallyWithOffset(1, func() error {
ctx, cancel := context.WithTimeout(context.Background(), nodeExecTimeout)
defer cancel()
bootTimeString, err := utils.ExecCommandOnNode(k8sClient, bootTimeCommand, node, ctx)
if err != nil {
return err
}
bootTime, err = time.Parse("2006-01-02 15:04:05", bootTimeString)
var err error
bootTime, err = utils.GetBootTime(ctx, k8sClientSet, node.GetName(), testNamespace)
if err != nil {
return err
}
return nil
}, 15*time.Minute, 10*time.Second).ShouldNot(HaveOccurred())
return &bootTime, nil
}, 15*time.Minute, 10*time.Second).ShouldNot(HaveOccurred(), "Could not get boot time on target node")
return bootTime, nil
}
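
For context, the new utils.GetBootTime helper called above is not part of this diff. Below is a minimal sketch of what it might look like, assuming it keeps the old inline approach shown in the removed lines (run uptime -s on the node and parse the output) and reuses the RunCommandInCluster exec helper that killApiConnection below switches to; the name, signature, and behavior are inferred from the call sites, not confirmed by this PR:

package utils

import (
	"context"
	"strings"
	"time"

	"k8s.io/client-go/kubernetes"
)

// GetBootTime returns the last boot time of the given node.
// Hypothetical sketch: assumes RunCommandInCluster executes a shell command
// in a pod on the node and returns its stdout.
func GetBootTime(ctx context.Context, c *kubernetes.Clientset, nodeName string, namespace string) (*time.Time, error) {
	// "uptime -s" prints the boot time as e.g. "2023-01-02 15:04:05"
	out, err := RunCommandInCluster(ctx, c, nodeName, namespace, "uptime -s")
	if err != nil {
		return nil, err
	}
	bootTime, err := time.Parse("2006-01-02 15:04:05", strings.TrimSpace(out))
	if err != nil {
		return nil, err
	}
	return &bootTime, nil
}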

func checkNoExecuteTaintRemoved(node *v1.Node) {
@@ -449,16 +451,15 @@ func checkReboot(node *v1.Node, oldBootTime *time.Time) {
}

func killApiConnection(node *v1.Node, apiIPs []string, withReconnect bool) {
By("killing api connectivity")
msg := fmt.Sprintf("killing api connectivity on NODE: %s and API ep: %v", node.Name, apiIPs)
By(msg)

script := composeScript(disconnectCommand, apiIPs)
script := "dnf -y install iproute && " + composeScript(disconnectCommand, apiIPs)
if withReconnect {
script += fmt.Sprintf(" && sleep %s && ", strconv.Itoa(int(reconnectInterval.Seconds())))
script += composeScript(reconnectCommand, apiIPs)
}

command := []string{"/bin/bash", "-c", script}

var ctx context.Context
var cancel context.CancelFunc
if withReconnect {
@@ -467,7 +468,7 @@ func killApiConnection(node *v1.Node, apiIPs []string, withReconnect bool) {
ctx, cancel = context.WithTimeout(context.Background(), nodeExecTimeout)
}
defer cancel()
_, err := utils.ExecCommandOnNode(k8sClient, command, node, ctx)
_, err := utils.RunCommandInCluster(ctx, k8sClientSet, node.GetName(), testNamespace, script)

if withReconnect {
//in case the sleep didn't work
@@ -523,7 +524,7 @@ func checkNoReboot(node *v1.Node, oldBootTime *time.Time) {
}, 5*time.Minute, 10*time.Second).Should(BeTemporally("==", *oldBootTime))
}
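
The composeScript helper used by killApiConnection above is also not shown in this diff. A plausible sketch, assuming it simply fills the command template (e.g. "ip route add blackhole %s") once per API server IP and chains the results, which is consistent with how the caller appends further commands with " && "; fmt and strings are already imported by this test file:

// Hypothetical sketch of composeScript; not part of this PR.
func composeScript(commandTemplate string, ips []string) string {
	// one command per IP, e.g. "ip route add blackhole 10.0.0.1"
	commands := make([]string, 0, len(ips))
	for _, ip := range ips {
		commands = append(commands, fmt.Sprintf(commandTemplate, ip))
	}
	// chain with && so the script fails fast on the first error
	return strings.Join(commands, " && ")
}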

func checkSnrLogs(node *v1.Node, expected []string) {
func checkSnrLogs(node *v1.Node, expected []string, since *metav1.Time) {
By("checking logs")
pod := findSnrPod(node)
ExpectWithOffset(1, pod).ToNot(BeNil())
@@ -535,7 +536,7 @@ func checkSnrLogs(node *v1.Node, expected []string) {

EventuallyWithOffset(1, func() string {
var err error
logs, err := utils.GetLogs(k8sClientSet, pod)
logs, err := utils.GetLogs(k8sClientSet, pod, since)
Contributor: nice!

if err != nil {
logger.Error(err, "failed to get logs, might retry")
return ""
@@ -614,11 +615,11 @@ func getApiIPs() []string {
Namespace: "default",
Name: "kubernetes",
}
ep := &v1.Endpoints{}
ExpectWithOffset(1, k8sClient.Get(context.Background(), key, ep)).ToNot(HaveOccurred())
svc := &v1.Service{}
ExpectWithOffset(1, k8sClient.Get(context.Background(), key, svc)).ToNot(HaveOccurred())
ips := make([]string, 0)
for _, addr := range ep.Subsets[0].Addresses {
ips = append(ips, addr.IP)
for _, addr := range svc.Spec.ClusterIPs {
ips = append(ips, addr)
}
return ips
}
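
Back to checkSnrLogs above: utils.GetLogs now takes a since timestamp, which is what the reviewer's "nice!" refers to, as it stops log lines from earlier specs satisfying the current expectation. A minimal sketch of how that can be wired through client-go's PodLogOptions.SinceTime; the actual helper is not shown in this PR, so the signature is inferred from the call site:

package utils

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// GetLogs returns the logs of the pod's container, restricted to lines
// written after "since". Hypothetical sketch; the real helper may differ.
func GetLogs(c *kubernetes.Clientset, pod *corev1.Pod, since *metav1.Time) (string, error) {
	opts := &corev1.PodLogOptions{
		// the API server filters server-side; a nil SinceTime returns everything
		SinceTime: since,
	}
	raw, err := c.CoreV1().Pods(pod.GetNamespace()).GetLogs(pod.GetName(), opts).DoRaw(context.Background())
	if err != nil {
		return "", err
	}
	return string(raw), nil
}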
@@ -657,14 +658,14 @@ func ensureSnrRunning(nodes *v1.NodeList) {
wg.Wait()
}

func printSNRLogsFromNode(node *v1.Node) {
By("printing self node remediation log of healthy node")
pod := findSnrPod(node)
logs, err := utils.GetLogs(k8sClientSet, pod)
Expect(err).ToNot(HaveOccurred())
logger.Info("BEGIN logs of healthy self-node-remediation pod", "name", pod.GetName())
for _, line := range strings.Split(logs, "\n") {
logger.Info(line)
}
logger.Info("END logs of healthy self-node-remediation pod", "name", pod.GetName())
}
//func printSNRLogsFromNode(node *v1.Node) {
// By("printing self node remediation log of healthy node")
// pod := findSnrPod(node)
// logs, err := utils.GetLogs(k8sClientSet, pod)
// Expect(err).ToNot(HaveOccurred())
// logger.Info("BEGIN logs of healthy self-node-remediation pod", "name", pod.GetName())
// for _, line := range strings.Split(logs, "\n") {
// logger.Info(line)
// }
// logger.Info("END logs of healthy self-node-remediation pod", "name", pod.GetName())
//}
5 changes: 5 additions & 0 deletions e2e/suite_test.go
@@ -7,6 +7,7 @@ import (
"github.com/go-logr/logr"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/onsi/gomega/format"

"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
@@ -37,6 +38,10 @@ func TestE2E(t *testing.T) {
}

var _ = BeforeSuite(func(ctx SpecContext) {

// don't limit log length
format.MaxLength = 0
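// (gomega truncates long object dumps at format.MaxLength characters by default; 0 disables truncation so full logs appear in failure output)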

logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
logger = logf.Log
