diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go index 625a05318e28..94b636b56572 100644 --- a/pkg/raft/raft.go +++ b/pkg/raft/raft.go @@ -1080,6 +1080,16 @@ func (r *raft) tickHeartbeat() { r.logger.Debugf("error occurred during checking sending heartbeat: %v", err) } + // Mark fortifying followers as recently active. We disable heartbeats when + // leader fortification is enabled, instead deferring to StoreLiveness for + // failure detection. As such, if there is no append activity for a raft + // group, it's possible for the leader to not communicate with a follower in + // a electionTimeout. We do not want to infer that the leader can't + // communicate with a follower in such cases, which could then cause the + // leader to spuriously step down because of CheckQuorum. Instead, we + // compute RecentlyActive based on StoreLiveness instead. + r.markFortifyingFollowersAsRecentlyActive() + // Try to refortify any followers that don't currently support us. r.bcastFortify() // TODO(ibrahim): add/call maybeUnpauseAndBcastAppend() here. @@ -2478,6 +2488,24 @@ func (r *raft) testingStepDown() error { return nil } +// markFortifyingFollowersAsRecentlyActive iterates over all the followers, and +// mark them as recently active if they are supporting the leader. +func (r *raft) markFortifyingFollowersAsRecentlyActive() { + if !r.fortificationTracker.FortificationEnabled() { + return + } + + r.trk.Visit(func(id pb.PeerID, pr *tracker.Progress) { + if pr.RecentActive { + return // return early as it's already marked as recently active + } + + if isFortified, _ := r.fortificationTracker.IsFortifiedBy(id); isFortified { + pr.RecentActive = true + } + }) +} + // advanceCommitViaMsgAppOnly returns true if the commit index is advanced on // the followers using MsgApp only. This means that heartbeats are not used to // advance the commit index. This function returns true only if all followers diff --git a/pkg/raft/testdata/fortification_checkquorum.txt b/pkg/raft/testdata/fortification_checkquorum.txt index 69dd42375af5..9b76f6f14df1 100644 --- a/pkg/raft/testdata/fortification_checkquorum.txt +++ b/pkg/raft/testdata/fortification_checkquorum.txt @@ -37,9 +37,24 @@ tick-election 1 ---- ok +print-fortification-state 1 +---- +1 : 1 +2 : 1 +3 : 1 + +# Even if we haven't received any messages from the follower since last tick, +# we are still okay because the follower is fortifying us which means that it's +# active. tick-election 1 ---- -DEBUG 1 has not received messages from a quorum of peers in the last election timeout +ok + +status 1 +---- +1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 +2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 +3: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 # Now, withdraw the StoreLiveness support for the leader's fortified epoch by # bumping it. This also shows that we won't break our LeadSupportUntil promise @@ -52,6 +67,10 @@ bump-epoch 1 2 2 1 1 3 2 1 1 +tick-election 1 +---- +DEBUG 1 does not have store liveness support from a quorum of peers + tick-election 1 ---- DEBUG 1 has not received messages from a quorum of peers in the last election timeout diff --git a/pkg/raft/tracker/progress.go b/pkg/raft/tracker/progress.go index c011562f7d08..c3b0caf90c97 100644 --- a/pkg/raft/tracker/progress.go +++ b/pkg/raft/tracker/progress.go @@ -101,8 +101,10 @@ type Progress struct { // case the follower does not erroneously remain in StateSnapshot. PendingSnapshot uint64 - // RecentActive is true if the progress is recently active. Receiving any messages - // from the corresponding follower indicates the progress is active. + // RecentActive is true if the progress is recently active. Receiving any + // messages from the corresponding follower indicates the progress is active. + // Also, it's set to true on every heartbeat timeout if the follower is + // fortifying the leader. // RecentActive can be reset to false after an election timeout. // This is always true on the leader. RecentActive bool