Skip to content

Commit

Permalink
System Test, RDS Core: active PDB failure on drain work-around
Browse files Browse the repository at this point in the history
  • Loading branch information
elenagerman committed Jan 31, 2025
1 parent 4b70722 commit 09b9dc3
Showing 1 changed file with 28 additions and 79 deletions.
107 changes: 28 additions & 79 deletions tests/system-tests/rdscore/internal/rdscorecommon/egress-ip.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@ package rdscorecommon
import (
"context"
"fmt"
policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"math/rand"
"net"
"net/netip"
"strings"
"time"

policyv1 "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/util/intstr"

"github.com/openshift-kni/eco-goinfra/pkg/bmc"
"github.com/openshift-kni/eco-goinfra/pkg/nodes"
"github.com/openshift-kni/eco-gotests/tests/system-tests/rdscore/internal/rdscoreparams"
Expand Down Expand Up @@ -498,7 +499,7 @@ func gracefulNodeReboot(nodeName string) error {
return fmt.Errorf("failed to retrieve node %s object due to: %w", nodeName, err)
}

glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Cordoning node %q", nodeName)
glog.V(100).Infof("Cordoning node %q", nodeName)

err = nodeObj.Cordon()

Expand All @@ -510,18 +511,18 @@ func gracefulNodeReboot(nodeName string) error {

time.Sleep(5 * time.Second)

glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Set minAvailable=0 to all found PDB to avoid " +
glog.V(100).Infof("Set minAvailable=0 to all found PDB to avoid " +
"Node Drain Failure due to active PodDisruptionBudget")

pdbList, err := setMinAvailableToZeroForActivePDB()
err = setMinAvailableToZeroForActivePDB()

if err != nil {
glog.V(100).Infof("Failed to retrieve pdb list and set minAvailable=0 due to %v", err)

return fmt.Errorf("failed to retrieve pdb list and set minAvailable=0 due to: %w", err)
}

glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Draining node %q", nodeName)
glog.V(100).Infof("Draining node %q", nodeName)

err = nodeObj.Drain()

Expand All @@ -531,17 +532,14 @@ func gracefulNodeReboot(nodeName string) error {
return fmt.Errorf("failed to drain node %q due to %w", nodeName, err)
}

glog.V(rdscoreparams.RDSCoreLogLevel).Infof(
fmt.Sprintf("NodesCredentialsMap:\n\t%#v", RDSCoreConfig.NodesCredentialsMap))
glog.V(100).Infof(fmt.Sprintf("NodesCredentialsMap:\n\t%#v", RDSCoreConfig.NodesCredentialsMap))

var bmcClient *bmc.BMC

glog.V(rdscoreparams.RDSCoreLogLevel).Infof(
fmt.Sprintf("Creating BMC client for node %s", nodeName))
glog.V(100).Infof(fmt.Sprintf("Creating BMC client for node %s", nodeName))

if auth, ok := RDSCoreConfig.NodesCredentialsMap[nodeName]; !ok {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof(
fmt.Sprintf("BMC Details for %q not found", nodeName))
glog.V(100).Infof(fmt.Sprintf("BMC Details for %q not found", nodeName))
Fail(fmt.Sprintf("BMC Details for %q not found", nodeName))
} else {
bmcClient = bmc.New(auth.BMCAddress).
Expand All @@ -556,20 +554,18 @@ func gracefulNodeReboot(nodeName string) error {
true,
func(ctx context.Context) (bool, error) {
if err := bmcClient.SystemForceReset(); err != nil {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof(
fmt.Sprintf("Failed to power cycle %s -> %v", nodeName, err))
glog.V(100).Infof(fmt.Sprintf("Failed to power cycle %s -> %v", nodeName, err))

return false, nil
}

glog.V(rdscoreparams.RDSCoreLogLevel).Infof(
fmt.Sprintf("Successfully powered cycle %s", nodeName))
glog.V(100).Infof(fmt.Sprintf("Successfully powered cycle %s", nodeName))

return true, nil
})

if err != nil {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Failed to reboot node %s", nodeName)
glog.V(100).Infof("Failed to reboot node %s", nodeName)

return fmt.Errorf("failed to reboot node %s", nodeName)
}
Expand All @@ -584,16 +580,16 @@ func gracefulNodeReboot(nodeName string) error {
func(ctx context.Context) (bool, error) {
currentNode, err := nodes.Pull(APIClient, nodeName)
if err != nil {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Failed to pull node: %v", err)
glog.V(100).Infof("Failed to pull node: %v", err)

return false, nil
}

for _, condition := range currentNode.Object.Status.Conditions {
if condition.Type == rdscoreparams.ConditionTypeReadyString {
if condition.Status != rdscoreparams.ConstantTrueString {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Node %q is notReady", currentNode.Definition.Name)
glog.V(rdscoreparams.RDSCoreLogLevel).Infof(" Reason: %s", condition.Reason)
glog.V(100).Infof("Node %q is notReady", currentNode.Definition.Name)
glog.V(100).Infof(" Reason: %s", condition.Reason)

return true, nil
}
Expand All @@ -604,7 +600,7 @@ func gracefulNodeReboot(nodeName string) error {
})

if err != nil {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof("The node %s hasn't reached notReady state", nodeName)
glog.V(100).Infof("The node %s hasn't reached notReady state", nodeName)

return fmt.Errorf("node %s hasn't reached notReady state", nodeName)
}
Expand All @@ -619,16 +615,16 @@ func gracefulNodeReboot(nodeName string) error {
func(ctx context.Context) (bool, error) {
currentNode, err := nodes.Pull(APIClient, nodeName)
if err != nil {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Error pulling in node: %v", err)
glog.V(100).Infof("Error pulling in node: %v", err)

return false, nil
}

for _, condition := range currentNode.Object.Status.Conditions {
if condition.Type == rdscoreparams.ConditionTypeReadyString {
if condition.Status == rdscoreparams.ConstantTrueString {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Node %q is Ready", currentNode.Definition.Name)
glog.V(rdscoreparams.RDSCoreLogLevel).Infof(" Reason: %s", condition.Reason)
glog.V(100).Infof("Node %q is Ready", currentNode.Definition.Name)
glog.V(100).Infof(" Reason: %s", condition.Reason)

return true, nil
}
Expand All @@ -639,32 +635,23 @@ func gracefulNodeReboot(nodeName string) error {
})

if err != nil {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof("The node %s hasn't reached Ready state", nodeName)
glog.V(100).Infof("The node %s hasn't reached Ready state", nodeName)

return fmt.Errorf("node %s hasn't reached Ready state", nodeName)
}

glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Uncordoning node %q", nodeName)
glog.V(100).Infof("Uncordoning node %q", nodeName)

err = nodeObj.Uncordon()

if err != nil {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof("Failed to uncordon %q due to %v", nodeName, err)
glog.V(100).Infof("Failed to uncordon %q due to %v", nodeName, err)

return fmt.Errorf("failed to uncordon %q due to %w", nodeName, err)
}

time.Sleep(15 * time.Second)

err = restoreActivePDBValues(pdbList)

if err != nil {
glog.V(rdscoreparams.RDSCoreLogLevel).Infof(
"Failed to restore minAvailable value for the active PDBs due to %v", err)

return fmt.Errorf("failed to restore minAvailable value for the active PDBs due to %v", err)
}

return nil
}

Expand Down Expand Up @@ -694,31 +681,15 @@ func getNodeForReboot(isIPv6 bool) (string, string, error) {
return "", "", fmt.Errorf("no egress IP address found in egressIP map")
}

func setMinAvailableToZeroForActivePDB() ([]*poddisruptionbudget.Builder, error) {
func setMinAvailableToZeroForActivePDB() error {
By("Workaround for the node drain failure due to active PodDisruptionBudget")

allAvailablePDB, err := poddisruptionbudget.ListInAllNamespaces(APIClient)

if err != nil {
glog.Infof("Failed to list all available pod disruption budget due to %v", err)

return nil, nil
}

var pdbMinAvailableList []*poddisruptionbudget.Builder

for _, _pdb := range allAvailablePDB {
minAvailableValue := _pdb.Object.Spec.MinAvailable

if minAvailableValue.IntValue() >= 1 {
pdbMinAvailableList = append(pdbMinAvailableList, _pdb)
}
}

if len(pdbMinAvailableList) == 0 {
glog.V(100).Infof("no poddisruptionbudget with the MinAvailable >= 1 found")

return nil, nil
return nil
}

zeroInt := intstr.FromInt32(0)
Expand All @@ -727,33 +698,11 @@ func setMinAvailableToZeroForActivePDB() ([]*poddisruptionbudget.Builder, error)
MinAvailable: &zeroInt,
}

for _, _pdb := range pdbMinAvailableList {
_, err = _pdb.WithPDBSpec(setZeroValue).Update(true)

if err != nil {
glog.V(100).Infof("Failed to update PodDisruptionBudget due to %v", err)

return nil, fmt.Errorf("failed to update PodDisruptionBudget due to %w", err)
}
}

return pdbMinAvailableList, nil
}

func restoreActivePDBValues(pdbMinAvailableList []*poddisruptionbudget.Builder) error {
By("Workaround for the node drain failure due to active PodDisruptionBudget, restore active PDBs values")

oneInt := intstr.FromInt32(1)

restoreValue := policyv1.PodDisruptionBudgetSpec{
MinAvailable: &oneInt,
}

for _, _pdb := range pdbMinAvailableList {
for _, _pdb := range allAvailablePDB {
minAvailableValue := _pdb.Object.Spec.MinAvailable

if minAvailableValue.IntValue() == 0 {
_, err := _pdb.WithPDBSpec(restoreValue).Update(true)
if minAvailableValue != nil && minAvailableValue.IntValue() == 1 {
_, err = _pdb.WithPDBSpec(setZeroValue).Update(false)

if err != nil {
glog.V(100).Infof("Failed to update PodDisruptionBudget due to %v", err)
Expand Down

0 comments on commit 09b9dc3

Please sign in to comment.