@@ -3,6 +3,7 @@ package bootstrapteardown
33import (
44 "context"
55 "fmt"
6+ "go.etcd.io/etcd/api/v3/etcdserverpb"
67 "time"
78
89 operatorv1 "github.com/openshift/api/operator/v1"
@@ -62,12 +63,29 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
6263 return fmt .Errorf ("failed to get bootstrap scaling strategy: %w" , err )
6364 }
6465 // checks the actual etcd cluster membership API if etcd-bootstrap exists
65- safeToRemoveBootstrap , hasBootstrap , bootstrapID , err := c .canRemoveEtcdBootstrap (ctx , scalingStrategy )
66+ safeToRemoveBootstrap , hasBootstrap , bootstrapMember , err := c .canRemoveEtcdBootstrap (ctx , scalingStrategy )
6667 if err != nil {
6768 return fmt .Errorf ("error while canRemoveEtcdBootstrap: %w" , err )
6869 }
6970
70- err = c .removeBootstrap (timeoutCtx , safeToRemoveBootstrap , hasBootstrap , bootstrapID )
71+ if hasBootstrap {
72+ if err := c .ensureBootstrapIsNotLeader (ctx , bootstrapMember ); err != nil {
73+ klog .Errorf ("error while ensuring bootstrap is not leader: %v" , err )
74+ }
75+ }
76+
77+ // TODO(thomas): it seems on SNO, this is not enough, we might have a non-working apiserver at this point in time
78+ revisionStable , err := ceohelpers .IsRevisionStable (c .operatorClient )
79+ if err != nil {
80+ return fmt .Errorf ("BootstrapTeardownController failed to determine stability of revisions: %w" , err )
81+ }
82+
83+ if ! revisionStable {
84+ klog .Infof ("BootstrapTeardownController is waiting for stable etcd revision before removing the bootstrap member" )
85+ return nil
86+ }
87+
88+ err = c .removeBootstrap (timeoutCtx , safeToRemoveBootstrap , hasBootstrap , bootstrapMember )
7189 if err != nil {
7290 _ , _ , updateErr := v1helpers .UpdateStatus (ctx , c .operatorClient , v1helpers .UpdateConditionFn (operatorv1.OperatorCondition {
7391 Type : "BootstrapTeardownDegraded" ,
@@ -90,13 +108,20 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
90108 return updateErr
91109}
92110
93- func (c * BootstrapTeardownController ) removeBootstrap (ctx context.Context , safeToRemoveBootstrap bool , hasBootstrap bool , bootstrapID uint64 ) error {
111+ func (c * BootstrapTeardownController ) removeBootstrap (ctx context.Context , safeToRemoveBootstrap bool , hasBootstrap bool , bootstrapMember * etcdserverpb.Member ) error {
112+ bootstrapID := uint64 (0 )
113+ bootstrapUrl := "unknown"
114+ if bootstrapMember != nil {
115+ bootstrapID = bootstrapMember .ID
116+ bootstrapUrl = bootstrapMember .GetClientURLs ()[0 ]
117+ }
118+
94119 if ! hasBootstrap {
95120 klog .V (4 ).Infof ("no bootstrap anymore setting removal status" )
96121 // this is to ensure the status is always set correctly, even if the status update below failed
97- updateErr := setSuccessfulBoostrapRemovalStatus (ctx , c .operatorClient )
122+ updateErr := setSuccessfulBootstrapRemovalStatus (ctx , c .operatorClient )
98123 if updateErr != nil {
99- return fmt .Errorf ("error while setSuccessfulBoostrapRemovalStatus : %w" , updateErr )
124+ return fmt .Errorf ("error while setSuccessfulBootstrapRemovalStatus : %w" , updateErr )
100125 }
101126
102127 // if the bootstrap isn't present, then clearly we're available enough to terminate. This avoids any risk of flapping.
@@ -141,20 +166,21 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT
141166 if isBootstrapComplete , err := bootstrap .IsBootstrapComplete (c .configmapLister ); ! isBootstrapComplete || err != nil {
142167 return err
143168 }
144- klog .Warningf ("Removing bootstrap member [%x]" , bootstrapID )
169+
170+ klog .Warningf ("Removing bootstrap member [%x] (%s)" , bootstrapID , bootstrapUrl )
145171
146172 // this is ugly until bootkube is updated, but we want to be sure that bootkube has time to be waiting to watch the condition coming back.
147173 if err := c .etcdClient .MemberRemove (ctx , bootstrapID ); err != nil {
148- return fmt .Errorf ("error while removing bootstrap member [%x]: %w" , bootstrapID , err )
174+ return fmt .Errorf ("error while removing bootstrap member [%x] (%s) : %w" , bootstrapID , bootstrapUrl , err )
149175 }
150176
151- klog .Infof ("Successfully removed bootstrap member [%x]" , bootstrapID )
177+ klog .Infof ("Successfully removed bootstrap member [%x] (%s) " , bootstrapID , bootstrapUrl )
152178 // below might fail, since the member removal can cause some downtime for raft to settle on a quorum
153179 // it's important that everything below is properly retried above during normal controller reconciliation
154- return setSuccessfulBoostrapRemovalStatus (ctx , c .operatorClient )
180+ return setSuccessfulBootstrapRemovalStatus (ctx , c .operatorClient )
155181}
156182
157- func setSuccessfulBoostrapRemovalStatus (ctx context.Context , client v1helpers.StaticPodOperatorClient ) error {
183+ func setSuccessfulBootstrapRemovalStatus (ctx context.Context , client v1helpers.StaticPodOperatorClient ) error {
158184 _ , _ , updateErr := v1helpers .UpdateStatus (ctx , client , v1helpers .UpdateConditionFn (operatorv1.OperatorCondition {
159185 Type : "EtcdBootstrapMemberRemoved" ,
160186 Status : operatorv1 .ConditionTrue ,
@@ -165,57 +191,101 @@ func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.St
165191}
166192
167193// canRemoveEtcdBootstrap returns whether it is safe to remove bootstrap, whether bootstrap is in the list, and an error
168- func (c * BootstrapTeardownController ) canRemoveEtcdBootstrap (ctx context.Context , scalingStrategy ceohelpers.BootstrapScalingStrategy ) (bool , bool , uint64 , error ) {
194+ func (c * BootstrapTeardownController ) canRemoveEtcdBootstrap (ctx context.Context , scalingStrategy ceohelpers.BootstrapScalingStrategy ) (bool , bool , * etcdserverpb. Member , error ) {
169195 members , err := c .etcdClient .MemberList (ctx )
170196 if err != nil {
171- return false , false , 0 , err
197+ return false , false , nil , err
172198 }
173199
174200 var hasBootstrap bool
175- var bootstrapMemberID uint64
201+ var bootstrapMember * etcdserverpb. Member
176202 for _ , member := range members {
177203 if member .Name == "etcd-bootstrap" {
178204 hasBootstrap = true
179- bootstrapMemberID = member . ID
205+ bootstrapMember = member
180206 break
181207 }
182208 }
183209 if ! hasBootstrap {
184- return false , hasBootstrap , bootstrapMemberID , nil
210+ return false , hasBootstrap , bootstrapMember , nil
185211 }
186212
187213 // First, enforce the main HA invariants in terms of member counts.
188214 switch scalingStrategy {
189215 case ceohelpers .HAScalingStrategy :
190216 if len (members ) < 4 {
191- return false , hasBootstrap , bootstrapMemberID , nil
217+ return false , hasBootstrap , bootstrapMember , nil
192218 }
193219 case ceohelpers .DelayedHAScalingStrategy :
194220 if len (members ) < 3 {
195- return false , hasBootstrap , bootstrapMemberID , nil
221+ return false , hasBootstrap , bootstrapMember , nil
196222 }
197223 case ceohelpers .UnsafeScalingStrategy :
198224 if len (members ) < 2 {
199- return false , hasBootstrap , bootstrapMemberID , nil
225+ return false , hasBootstrap , bootstrapMember , nil
200226 }
201227 }
202228
203229 // Next, given member counts are satisfied, check member health.
204230 unhealthyMembers , err := c .etcdClient .UnhealthyMembers (ctx )
205231 if err != nil {
206- return false , hasBootstrap , bootstrapMemberID , nil
232+ return false , hasBootstrap , bootstrapMember , nil
207233 }
208234
209235 // the etcd-bootstrap member is allowed to be unhealthy and can still be removed
210236 switch {
211237 case len (unhealthyMembers ) == 0 :
212- return true , hasBootstrap , bootstrapMemberID , nil
238+ return true , hasBootstrap , bootstrapMember , nil
213239 case len (unhealthyMembers ) > 1 :
214- return false , hasBootstrap , bootstrapMemberID , nil
240+ return false , hasBootstrap , bootstrapMember , nil
215241 default :
216242 if unhealthyMembers [0 ].Name == "etcd-bootstrap" {
217- return true , true , unhealthyMembers [0 ].ID , nil
243+ return true , true , bootstrapMember , nil
244+ }
245+ return false , hasBootstrap , bootstrapMember , nil
246+ }
247+ }
248+
249+ func (c * BootstrapTeardownController ) ensureBootstrapIsNotLeader (ctx context.Context , bootstrapMember * etcdserverpb.Member ) error {
250+ if bootstrapMember == nil {
251+ return fmt .Errorf ("bootstrap member was not provided" )
252+ }
253+ status , err := c .etcdClient .Status (ctx , bootstrapMember .ClientURLs [0 ])
254+ if err != nil {
255+ return fmt .Errorf ("could not find bootstrap member status: %w" , err )
256+ }
257+
258+ if bootstrapMember .ID != status .Leader {
259+ return nil
260+ }
261+
262+ klog .Warningf ("Bootstrap member [%x] (%s) detected as leader, trying to move elsewhere..." , bootstrapMember .ID , bootstrapMember .GetClientURLs ()[0 ])
263+
264+ memberHealth , err := c .etcdClient .MemberHealth (ctx )
265+ if err != nil {
266+ return fmt .Errorf ("could not find member health: %w" , err )
267+ }
268+
269+ var otherMember * etcdserverpb.Member
270+ // we can pick any other healthy voting member as the target to move to
271+ for _ , m := range memberHealth .GetHealthyMembers () {
272+ if m .ID != bootstrapMember .ID && ! m .IsLearner {
273+ otherMember = m
274+ break
218275 }
219- return false , hasBootstrap , bootstrapMemberID , nil
220276 }
277+
278+ if otherMember == nil {
279+ return fmt .Errorf ("could not find other healthy member to move leader" )
280+ }
281+
282+ klog .Warningf ("Moving lead from bootstrap member [%x] (%s) detected as leader to [%x] (%s)" , bootstrapMember .ID , bootstrapMember .GetClientURLs ()[0 ], otherMember .ID , otherMember .GetClientURLs ()[0 ])
283+ err = c .etcdClient .MoveLeader (ctx , otherMember .ID )
284+ if err != nil {
285+ return err
286+ }
287+
288+ klog .Warningf ("Moving lead from bootstrap member [%x] (%s) to [%x] (%s) succesfully!" , bootstrapMember .ID , bootstrapMember .GetClientURLs ()[0 ], otherMember .ID , otherMember .GetClientURLs ()[0 ])
289+
290+ return nil
221291}
0 commit comments