Skip to content

Commit c7621a9

Browse files
committed
OCPEDGE-2088, OCPEDGE-1885: Updated state transitions & tests for TNF setup job
- Setup job now sets Available condition only when ExternalEtcd isn't enabled yet (includes test) - Fixed jobcontroller & test setup to handle variable operator status conditions - Added test coverage for updated expectations for job controller condition transitions (Progressing, Degraded, Available)
1 parent 61b7fb5 commit c7621a9

File tree

4 files changed

+309
-28
lines changed

4 files changed

+309
-28
lines changed

pkg/tnf/operator/starter.go

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@ import (
44
"bytes"
55
"context"
66
"fmt"
7-
operatorv1informers "github.com/openshift/client-go/operator/informers/externalversions/operator/v1"
87
"os"
98
"sync"
109

10+
operatorv1informers "github.com/openshift/client-go/operator/informers/externalversions/operator/v1"
11+
1112
operatorv1 "github.com/openshift/api/operator/v1"
1213
configv1informers "github.com/openshift/client-go/config/informers/externalversions"
1314
v1 "github.com/openshift/client-go/config/informers/externalversions/config/v1"
@@ -61,14 +62,30 @@ func HandleDualReplicaClusters(ctx context.Context,
6162
kubeClient kubernetes.Interface,
6263
dynamicClient dynamic.Interface) (bool, error) {
6364

64-
if isDualReplicaTopology, err := isDualReplicaTopoly(ctx, featureGateAccessor, configInformers); err != nil {
65+
// Start the informers and wait for them to sync.
66+
// The config informer is used to check the control plane topology.
67+
// The operator client informer is used to check if CEO is managing etcd, since
68+
// we need to set the available status to false during the transition to external etcd.
69+
configInformers.Start(ctx.Done())
70+
operatorClient.Informer().Run(ctx.Done())
71+
if !cache.WaitForCacheSync(ctx.Done(), configInformers.Config().V1().APIServers().Informer().HasSynced, operatorClient.Informer().HasSynced) {
72+
klog.Fatal("Failed to sync caches for static pod operator client")
73+
}
74+
75+
if isDualReplicaTopology, err := isDualReplicaTopology(ctx, featureGateAccessor, configInformers); err != nil {
6576
return false, err
6677
} else if !isDualReplicaTopology {
6778
return false, nil
6879
}
6980

7081
klog.Infof("detected DualReplica topology")
7182

83+
// We only set the CEO available status to false during the initial transition to external etcd.
84+
initialTransition, err := isCEOManagingEtcd(operatorClient)
85+
if err != nil {
86+
return false, fmt.Errorf("could not determine if we are in the initial setup: %w", err)
87+
}
88+
7289
runExternalEtcdSupportController(ctx, controllerContext, operatorClient, envVarGetter, kubeInformersForNamespaces,
7390
configInformers, networkInformer, controlPlaneNodeInformer, etcdInformer, kubeClient)
7491
runTnfResourceController(ctx, controllerContext, kubeClient, dynamicClient, operatorClient, kubeInformersForNamespaces)
@@ -78,7 +95,7 @@ func HandleDualReplicaClusters(ctx context.Context,
7895
// we need node names for assigning auth and after-setup jobs to specific nodes
7996
var once sync.Once
8097
klog.Infof("watching for nodes...")
81-
_, err := controlPlaneNodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
98+
_, err = controlPlaneNodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
8299
AddFunc: func(obj interface{}) {
83100
node, ok := obj.(*corev1.Node)
84101
if !ok {
@@ -102,11 +119,22 @@ func HandleDualReplicaClusters(ctx context.Context,
102119
klog.Infof("found 2 control plane nodes (%q, %q), creating TNF jobs", nodeList[0].GetName(), nodeList[1].GetName())
103120
// the order of job creation does not matter, the jobs wait on each other as needed
104121
for _, node := range nodeList {
105-
runJobController(ctx, tools.JobTypeAuth, &node.Name, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
106-
runJobController(ctx, tools.JobTypeAfterSetup, &node.Name, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
122+
runJobController(ctx, tools.JobTypeAuth, &node.Name, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces, jobs.DefaultConditions)
123+
runJobController(ctx, tools.JobTypeAfterSetup, &node.Name, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces, jobs.DefaultConditions)
107124
}
108-
runJobController(ctx, tools.JobTypeSetup, nil, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
109-
runJobController(ctx, tools.JobTypeFencing, nil, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
125+
126+
// Make a copy of the default conditions and add the available condition
127+
conditionsWithAvailable := make([]string, len(jobs.DefaultConditions))
128+
copy(conditionsWithAvailable, jobs.DefaultConditions)
129+
conditionsWithAvailable = append(conditionsWithAvailable, operatorv1.OperatorStatusTypeAvailable)
130+
131+
setupConditions := jobs.DefaultConditions
132+
if initialTransition {
133+
setupConditions = conditionsWithAvailable
134+
}
135+
136+
runJobController(ctx, tools.JobTypeSetup, nil, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces, setupConditions)
137+
runJobController(ctx, tools.JobTypeFencing, nil, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces, jobs.DefaultConditions)
110138
})
111139
},
112140
})
@@ -138,7 +166,21 @@ func HandleDualReplicaClusters(ctx context.Context,
138166
return true, nil
139167
}
140168

141-
func isDualReplicaTopoly(ctx context.Context, featureGateAccessor featuregates.FeatureGateAccess, configInformers configv1informers.SharedInformerFactory) (bool, error) {
169+
func isCEOManagingEtcd(operatorClient v1helpers.StaticPodOperatorClient) (bool, error) {
170+
// Detect if the cluster is already running in ExternalEtcd mode
171+
operatorSpec, _, _, err := operatorClient.GetStaticPodOperatorState()
172+
if err != nil {
173+
return false, fmt.Errorf("could not get operator spec: %w", err)
174+
}
175+
externalEtcdMode, err := ceohelpers.IsExternalEtcdSupport(operatorSpec)
176+
if err != nil {
177+
return false, fmt.Errorf("could not determine if useExternalEtcdSupport config override is set: %w", err)
178+
}
179+
180+
return !externalEtcdMode, nil
181+
}
182+
183+
func isDualReplicaTopology(ctx context.Context, featureGateAccessor featuregates.FeatureGateAccess, configInformers configv1informers.SharedInformerFactory) (bool, error) {
142184
if isDualReplicaTopology, err := ceohelpers.IsDualReplicaTopology(ctx, configInformers.Config().V1().Infrastructures().Lister()); err != nil {
143185
return false, fmt.Errorf("could not determine DualReplicaTopology, aborting controller start: %w", err)
144186
} else if !isDualReplicaTopology {
@@ -201,7 +243,7 @@ func runTnfResourceController(ctx context.Context, controllerContext *controller
201243
go tnfResourceController.Run(ctx, 1)
202244
}
203245

204-
func runJobController(ctx context.Context, jobType tools.JobType, nodeName *string, controllerContext *controllercmd.ControllerContext, operatorClient v1helpers.StaticPodOperatorClient, kubeClient kubernetes.Interface, kubeInformersForNamespaces v1helpers.KubeInformersForNamespaces) {
246+
func runJobController(ctx context.Context, jobType tools.JobType, nodeName *string, controllerContext *controllercmd.ControllerContext, operatorClient v1helpers.StaticPodOperatorClient, kubeClient kubernetes.Interface, kubeInformersForNamespaces v1helpers.KubeInformersForNamespaces, conditions []string) {
205247
nodeNameForLogs := "n/a"
206248
if nodeName != nil {
207249
nodeNameForLogs = *nodeName
@@ -214,6 +256,7 @@ func runJobController(ctx context.Context, jobType tools.JobType, nodeName *stri
214256
operatorClient,
215257
kubeClient,
216258
kubeInformersForNamespaces.InformersFor(operatorclient.TargetNamespace).Batch().V1().Jobs(),
259+
conditions,
217260
[]factory.Informer{},
218261
[]jobs.JobHookFunc{
219262
func(_ *operatorv1.OperatorSpec, job *batchv1.Job) error {

pkg/tnf/operator/starter_test.go

Lines changed: 114 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@ package operator
22

33
import (
44
"context"
5-
"github.com/stretchr/testify/require"
65
"maps"
76
"testing"
87
"time"
98

9+
"github.com/stretchr/testify/require"
10+
1011
configv1 "github.com/openshift/api/config/v1"
1112
operatorv1 "github.com/openshift/api/operator/v1"
1213

@@ -36,8 +37,49 @@ import (
3637
"github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
3738
"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
3839
"github.com/openshift/cluster-etcd-operator/pkg/tnf/operator/dualreplicahelpers"
40+
"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/jobs"
3941
)
4042

43+
// mockSharedIndexInformer implements cache.SharedIndexInformer with proper Run and HasSynced methods
44+
type mockSharedIndexInformer struct {
45+
cache.SharedIndexInformer
46+
}
47+
48+
func (m *mockSharedIndexInformer) Run(stopCh <-chan struct{}) {
49+
// Mock implementation that just waits for the stop channel
50+
<-stopCh
51+
}
52+
53+
func (m *mockSharedIndexInformer) HasSynced() bool {
54+
return true
55+
}
56+
57+
// createMockOperatorClient creates a StaticPodOperatorClient with a mock informer
58+
func createMockOperatorClient(spec *operatorv1.StaticPodOperatorSpec, status *operatorv1.StaticPodOperatorStatus) v1helpers.StaticPodOperatorClient {
59+
client := v1helpers.NewFakeStaticPodOperatorClient(spec, status, nil, nil)
60+
61+
// Create a mock informer
62+
mockInformer := &mockSharedIndexInformer{
63+
SharedIndexInformer: client.Informer(),
64+
}
65+
66+
// Return a wrapper that uses our mock informer
67+
return &mockOperatorClient{
68+
StaticPodOperatorClient: client,
69+
mockInformer: mockInformer,
70+
}
71+
}
72+
73+
// mockOperatorClient wraps the fake client with our mock informer
74+
type mockOperatorClient struct {
75+
v1helpers.StaticPodOperatorClient
76+
mockInformer *mockSharedIndexInformer
77+
}
78+
79+
func (m *mockOperatorClient) Informer() cache.SharedIndexInformer {
80+
return m.mockInformer
81+
}
82+
4183
type args struct {
4284
ctx context.Context
4385
controllerContext *controllercmd.ControllerContext
@@ -112,6 +154,75 @@ func TestHandleDualReplicaClusters(t *testing.T) {
112154
}
113155
}
114156

157+
func TestSetupJobConditionsBasedOnExternalEtcd(t *testing.T) {
158+
tests := []struct {
159+
name string
160+
externalEtcdEnabled bool
161+
expectedAvailableInSetup bool
162+
}{
163+
{
164+
name: "Job sets the Available condition during initial setup",
165+
externalEtcdEnabled: false,
166+
expectedAvailableInSetup: true,
167+
},
168+
{
169+
name: "Job does not set the Available condition once ExternalEtcd is enabled",
170+
externalEtcdEnabled: true,
171+
expectedAvailableInSetup: false,
172+
},
173+
}
174+
175+
for _, tt := range tests {
176+
t.Run(tt.name, func(t *testing.T) {
177+
// Create operator spec with or without external etcd support
178+
operatorSpec := &operatorv1.StaticPodOperatorSpec{}
179+
if tt.externalEtcdEnabled {
180+
operatorSpec.UnsupportedConfigOverrides = runtime.RawExtension{
181+
Raw: []byte(`{"useExternalEtcdSupport": true}`),
182+
}
183+
}
184+
185+
// Create the operator client with mock informer
186+
fakeOperatorClient := createMockOperatorClient(
187+
operatorSpec,
188+
&operatorv1.StaticPodOperatorStatus{},
189+
)
190+
191+
// Test initialTransition function
192+
initialTransition, err := isCEOManagingEtcd(fakeOperatorClient)
193+
require.NoError(t, err)
194+
195+
// Verify the expected behavior
196+
expectedInitialTransition := !tt.externalEtcdEnabled
197+
require.Equalf(t, expectedInitialTransition, initialTransition,
198+
"initialTransition should return %v when externalEtcdEnabled is %v",
199+
expectedInitialTransition, tt.externalEtcdEnabled)
200+
201+
// Verify setup conditions
202+
conditionsWithAvailable := make([]string, len(jobs.DefaultConditions))
203+
copy(conditionsWithAvailable, jobs.DefaultConditions)
204+
conditionsWithAvailable = append(conditionsWithAvailable, operatorv1.OperatorStatusTypeAvailable)
205+
206+
setupConditions := jobs.DefaultConditions
207+
if initialTransition {
208+
setupConditions = conditionsWithAvailable
209+
}
210+
211+
hasAvailableCondition := false
212+
for _, condition := range setupConditions {
213+
if condition == operatorv1.OperatorStatusTypeAvailable {
214+
hasAvailableCondition = true
215+
break
216+
}
217+
}
218+
219+
require.Equalf(t, tt.expectedAvailableInSetup, hasAvailableCondition,
220+
"Setup job should have Available condition: %v, but got: %v",
221+
tt.expectedAvailableInSetup, hasAvailableCondition)
222+
})
223+
}
224+
}
225+
115226
func getArgs(t *testing.T, dualReplicaControlPlaneEnabled, dualReplicaFeatureGateEnabled bool) args {
116227

117228
fakeKubeClient := fake.NewSimpleClientset()
@@ -131,11 +242,10 @@ func getArgs(t *testing.T, dualReplicaControlPlaneEnabled, dualReplicaFeatureGat
131242
}
132243
fakeConfigClient := fakeconfig.NewSimpleClientset([]runtime.Object{infra}...)
133244

134-
fakeOperatorClient := v1helpers.NewFakeStaticPodOperatorClient(
245+
// Create the operator client with mock informer
246+
fakeOperatorClient := createMockOperatorClient(
135247
&operatorv1.StaticPodOperatorSpec{},
136248
&operatorv1.StaticPodOperatorStatus{},
137-
nil,
138-
nil,
139249
)
140250

141251
enabledFeatureGates := make([]configv1.FeatureGateName, 0)

pkg/tnf/pkg/jobs/jobcontroller.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ import (
2626
"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/tools"
2727
)
2828

29+
// By default, we only set the progressing and degraded conditions for TNF setup.
30+
// This is because the operator is only unavailable during the transition of etcd
31+
// ownership between CEO and pacemaker.
32+
var DefaultConditions = []string{opv1.OperatorStatusTypeProgressing, opv1.OperatorStatusTypeDegraded}
33+
2934
// TODO This based on DeploymentController in openshift/library-go
3035
// TODO should be moved there once it proved to be useful
3136

@@ -71,6 +76,7 @@ func NewJobController(
7176
operatorClient v1helpers.OperatorClient,
7277
kubeClient kubernetes.Interface,
7378
jobInformer batchinformersv1.JobInformer,
79+
conditions []string,
7480
optionalInformers []factory.Informer,
7581
optionalJobHooks ...JobHookFunc,
7682
) factory.Controller {
@@ -82,9 +88,7 @@ func NewJobController(
8288
kubeClient,
8389
jobInformer,
8490
).WithConditions(
85-
opv1.OperatorStatusTypeAvailable,
86-
opv1.OperatorStatusTypeProgressing,
87-
opv1.OperatorStatusTypeDegraded,
91+
conditions...,
8892
).WithExtraInformers(
8993
optionalInformers...,
9094
).WithJobHooks(

0 commit comments

Comments
 (0)