Skip to content

Commit

Permalink
set rollout start date and don't start updating if rollout just chang…
Browse files Browse the repository at this point in the history
…ed (#50365)

This commit does two changes:
- the controller now sets the rollout start time when resetting the
  rollout
- the controller will not start a group if the rollout changed during
  the maintenance window (checks if the rollout start time is in the
  window)
  • Loading branch information
hugoShaka authored Dec 19, 2024
1 parent 47c0216 commit aaea047
Show file tree
Hide file tree
Showing 11 changed files with 232 additions and 57 deletions.
1 change: 1 addition & 0 deletions api/gen/proto/go/teleport/autoupdate/v1/autoupdate.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions api/proto/teleport/autoupdate/v1/autoupdate.proto
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ message AutoUpdateAgentRolloutStatus {
// For example, a group updates every day between 13:00 and 14:00. If the target version changes to 13:30, the group
// will not start updating to the new version directly. The controller sees that the group theoretical start time is
// before the rollout start time and the maintenance window belongs to the previous rollout.
// When the timestamp is nil, the controller will ignore the start time and check and allow groups to activate.
google.protobuf.Timestamp start_time = 3;
}

Expand Down
1 change: 1 addition & 0 deletions lib/autoupdate/rollout/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ func NewController(client Client, log *slog.Logger, clock clockwork.Clock, perio
reconciler: reconciler{
clt: client,
log: log,
clock: clock,
rolloutStrategies: []rolloutStrategy{
// TODO(hugoShaka): add the strategies here as we implement them
},
Expand Down
12 changes: 7 additions & 5 deletions lib/autoupdate/rollout/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func (r *reconciler) tryReconcile(ctx context.Context) error {
return trace.Wrap(err, "computing rollout status")
}

// there was an existing rollout, we must figure if something changed
// We compute if something changed.
specChanged := !proto.Equal(existingRollout.GetSpec(), newSpec)
statusChanged := !proto.Equal(existingRollout.GetStatus(), newStatus)
rolloutChanged := specChanged || statusChanged
Expand Down Expand Up @@ -273,6 +273,8 @@ func (r *reconciler) computeStatus(
// We create a new status if the rollout should be reset or the previous status was nil
if shouldResetRollout || existingRollout.GetStatus() == nil {
status = new(autoupdate.AutoUpdateAgentRolloutStatus)
// We set the start time if this is a new rollout
status.StartTime = timestamppb.New(r.clock.Now())
} else {
status = utils.CloneProtoMsg(existingRollout.GetStatus())
}
Expand Down Expand Up @@ -302,16 +304,16 @@ func (r *reconciler) computeStatus(
return nil, trace.Wrap(err, "creating groups status")
}
}
status.Groups = groups

err = r.progressRollout(ctx, newSpec.GetStrategy(), groups)
err = r.progressRollout(ctx, newSpec.GetStrategy(), status)
// Failing to progress the update is not a hard failure.
// We want to update the status even if something went wrong to surface the failed reconciliation and potential errors to the user.
if err != nil {
r.log.ErrorContext(ctx, "Errors encountered during rollout progress. Some groups might not get updated properly.",
"error", err)
}

status.Groups = groups
status.State = computeRolloutState(groups)
return status, nil
}
Expand All @@ -320,10 +322,10 @@ func (r *reconciler) computeStatus(
// groups are updated in place.
// If an error is returned, the groups should still be upserted, depending on the strategy,
// failing to update a group might not be fatal (other groups can still progress independently).
func (r *reconciler) progressRollout(ctx context.Context, strategyName string, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error {
func (r *reconciler) progressRollout(ctx context.Context, strategyName string, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
for _, strategy := range r.rolloutStrategies {
if strategy.name() == strategyName {
return strategy.progressRollout(ctx, groups)
return strategy.progressRollout(ctx, status)
}
}
return trace.NotImplemented("rollout strategy %q not implemented", strategyName)
Expand Down
57 changes: 36 additions & 21 deletions lib/autoupdate/rollout/reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ func TestTryReconcile(t *testing.T) {
t.Parallel()
log := utils.NewSlogLoggerForTests()
ctx := context.Background()
clock := clockwork.NewFakeClock()

// Test setup: creating fixtures
configOK, err := update.NewAutoUpdateConfig(&autoupdate.AutoUpdateConfigSpec{
Tools: &autoupdate.AutoUpdateConfigSpecTools{
Expand Down Expand Up @@ -186,7 +188,7 @@ func TestTryReconcile(t *testing.T) {
Strategy: update.AgentsStrategyHaltOnError,
})
require.NoError(t, err)
upToDateRollout.Status = &autoupdate.AutoUpdateAgentRolloutStatus{}
upToDateRollout.Status = &autoupdate.AutoUpdateAgentRolloutStatus{StartTime: timestamppb.New(clock.Now())}

outOfDateRollout, err := update.NewAutoUpdateAgentRollout(&autoupdate.AutoUpdateAgentRolloutSpec{
StartVersion: "1.2.2",
Expand Down Expand Up @@ -315,8 +317,9 @@ func TestTryReconcile(t *testing.T) {
// Test execution: Running the reconciliation

reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

require.NoError(t, reconciler.tryReconcile(ctx))
Expand All @@ -330,6 +333,7 @@ func TestTryReconcile(t *testing.T) {
func TestReconciler_Reconcile(t *testing.T) {
log := utils.NewSlogLoggerForTests()
ctx := context.Background()
clock := clockwork.NewFakeClock()
// Test setup: creating fixtures
config, err := update.NewAutoUpdateConfig(&autoupdate.AutoUpdateConfigSpec{
Tools: &autoupdate.AutoUpdateConfigSpecTools{
Expand Down Expand Up @@ -361,7 +365,7 @@ func TestReconciler_Reconcile(t *testing.T) {
Strategy: update.AgentsStrategyHaltOnError,
})
require.NoError(t, err)
upToDateRollout.Status = &autoupdate.AutoUpdateAgentRolloutStatus{}
upToDateRollout.Status = &autoupdate.AutoUpdateAgentRolloutStatus{StartTime: timestamppb.New(clock.Now())}

outOfDateRollout, err := update.NewAutoUpdateAgentRollout(&autoupdate.AutoUpdateAgentRolloutSpec{
StartVersion: "1.2.2",
Expand Down Expand Up @@ -407,8 +411,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand All @@ -431,8 +436,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand Down Expand Up @@ -471,8 +477,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand Down Expand Up @@ -509,8 +516,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand All @@ -533,8 +541,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand Down Expand Up @@ -563,8 +572,9 @@ func TestReconciler_Reconcile(t *testing.T) {

client := newMockClient(t, stubs)
reconciler := &reconciler{
clt: client,
log: log,
clt: client,
log: log,
clock: clock,
}

// Test execution: run the reconciliation loop
Expand Down Expand Up @@ -704,7 +714,7 @@ func (f *fakeRolloutStrategy) name() string {
return f.strategyName
}

func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error {
func (f *fakeRolloutStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
f.calls++
return nil
}
Expand Down Expand Up @@ -742,8 +752,9 @@ func Test_reconciler_computeStatus(t *testing.T) {
newGroups, err := r.makeGroupsStatus(ctx, schedules, clock.Now())
require.NoError(t, err)
newStatus := &autoupdate.AutoUpdateAgentRolloutStatus{
Groups: newGroups,
State: autoupdate.AutoUpdateAgentRolloutState_AUTO_UPDATE_AGENT_ROLLOUT_STATE_UNSTARTED,
Groups: newGroups,
State: autoupdate.AutoUpdateAgentRolloutState_AUTO_UPDATE_AGENT_ROLLOUT_STATE_UNSTARTED,
StartTime: timestamppb.New(clock.Now()),
}

tests := []struct {
Expand Down Expand Up @@ -835,15 +846,19 @@ func Test_reconciler_computeStatus(t *testing.T) {
Strategy: fakeRolloutStrategyName,
},
// groups should be unset
expectedStatus: &autoupdate.AutoUpdateAgentRolloutStatus{},
expectedStatus: &autoupdate.AutoUpdateAgentRolloutStatus{
StartTime: timestamppb.New(clock.Now()),
},
expectedStrategyCalls: 0,
},
{
name: "new groups are populated if previous ones were empty",
existingRollout: &autoupdate.AutoUpdateAgentRollout{
Spec: oldSpec,
// old groups were empty
Status: &autoupdate.AutoUpdateAgentRolloutStatus{},
Status: &autoupdate.AutoUpdateAgentRolloutStatus{
StartTime: timestamppb.New(clock.Now()),
},
},
// no spec change
newSpec: oldSpec,
Expand Down
13 changes: 12 additions & 1 deletion lib/autoupdate/rollout/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@ const (
// Common update reasons
updateReasonCreated = "created"
updateReasonReconcilerError = "reconciler_error"
updateReasonRolloutChanged = "rollout_changed_during_window"
)

// rolloutStrategy is responsible for rolling out the update across groups.
// This interface allows us to inject dummy strategies for simpler testing.
type rolloutStrategy interface {
name() string
progressRollout(context.Context, []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error
progressRollout(context.Context, *autoupdate.AutoUpdateAgentRolloutStatus) error
}

func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time) (bool, error) {
Expand All @@ -53,6 +54,16 @@ func inWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now time.Time
return int(group.ConfigStartHour) == now.Hour(), nil
}

// rolloutChangedInWindow checks if the rollout got created after the theoretical group start time
func rolloutChangedInWindow(group *autoupdate.AutoUpdateAgentRolloutStatusGroup, now, rolloutStart time.Time) (bool, error) {
// If the rollout is older than 24h, we know it did not change during the window
if now.Sub(rolloutStart) > 24*time.Hour {
return false, nil
}
// Else we check if the rollout happened in the group window.
return inWindow(group, rolloutStart)
}

func canUpdateToday(allowedDays []string, now time.Time) (bool, error) {
for _, allowedDay := range allowedDays {
if allowedDay == types.Wildcard {
Expand Down
39 changes: 27 additions & 12 deletions lib/autoupdate/rollout/strategy_haltonerror.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func newHaltOnErrorStrategy(log *slog.Logger, clock clockwork.Clock) (rolloutStr
}, nil
}

func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*autoupdate.AutoUpdateAgentRolloutStatusGroup) error {
func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, status *autoupdate.AutoUpdateAgentRolloutStatus) error {
now := h.clock.Now()
// We process every group in order, all the previous groups must be in the DONE state
// for the next group to become active. Even if some early groups are not DONE,
Expand All @@ -72,12 +72,12 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*aut
// to transition "staging" to DONE.
previousGroupsAreDone := true

for i, group := range groups {
for i, group := range status.Groups {
switch group.State {
case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_UNSTARTED:
var previousGroup *autoupdate.AutoUpdateAgentRolloutStatusGroup
if i != 0 {
previousGroup = groups[i-1]
previousGroup = status.Groups[i-1]
}
canStart, err := canStartHaltOnError(group, previousGroup, now)
if err != nil {
Expand All @@ -86,16 +86,31 @@ func (h *haltOnErrorStrategy) progressRollout(ctx context.Context, groups []*aut
setGroupState(group, group.State, updateReasonReconcilerError, now)
return err
}

// Check if the rollout got created after the theoretical group start time
rolloutChangedDuringWindow, err := rolloutChangedInWindow(group, now, status.StartTime.AsTime())
if err != nil {
setGroupState(group, group.State, updateReasonReconcilerError, now)
return err
}

switch {
case previousGroupsAreDone && canStart:
// We can start
setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now)
case previousGroupsAreDone:
// All previous groups are OK, but time-related criteria are not OK
case !previousGroupsAreDone:
// All previous groups are not DONE
setGroupState(group, group.State, updateReasonPreviousGroupsNotDone, now)
case !canStart:
// All previous groups are DONE, but time-related criteria are not met
// This can be because we are outside an update window, or because the
// specified wait_hours doesn't let us update yet.
setGroupState(group, group.State, updateReasonCannotStart, now)
case rolloutChangedDuringWindow:
// All previous groups are DONE and time-related criteria are met.
// However, the rollout changed during the maintenance window.
setGroupState(group, group.State, updateReasonRolloutChanged, now)
default:
// At least one previous group is not DONE
setGroupState(group, group.State, updateReasonPreviousGroupsNotDone, now)
// All previous groups are DONE and time-related criteria are met.
// We can start.
setGroupState(group, autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ACTIVE, updateReasonCanStart, now)
}
previousGroupsAreDone = false
case autoupdate.AutoUpdateAgentGroupState_AUTO_UPDATE_AGENT_GROUP_STATE_ROLLEDBACK:
Expand Down Expand Up @@ -132,10 +147,10 @@ func canStartHaltOnError(group, previousGroup *autoupdate.AutoUpdateAgentRollout

previousStart := previousGroup.StartTime.AsTime()
if previousStart.IsZero() || previousStart.Unix() == 0 {
return false, trace.BadParameter("the previous group doesn't have a start time, cannot check the 'wait_hours' criteria")
return false, trace.BadParameter("the previous group doesn't have a start time, cannot check the 'wait_hours' criterion")
}

// Check if the wait_hours criteria is OK, if we are at least after 'wait_hours' hours since the previous start.
// Check if the wait_hours criterion is OK, if we are at least after 'wait_hours' hours since the previous start.
if now.Before(previousGroup.StartTime.AsTime().Add(time.Duration(group.ConfigWaitHours) * time.Hour)) {
return false, nil
}
Expand Down
Loading

0 comments on commit aaea047

Please sign in to comment.