From 420b28e24fe846aca077a1afe0de04106804c9ea Mon Sep 17 00:00:00 2001 From: Ibrahim Kettaneh Date: Wed, 18 Sep 2024 09:21:14 -0400 Subject: [PATCH] raft: mark supporting followers as recently active This commit marks followers who support the current leader as recently active. This is very important because if we stop heartbeats, we might not send any MsgApp to the followers because there is nothing new to append. This means that we will mark the followers as inactive, even though that they are technically active and supporting the leader. Fixes: #131349 Release note: None --- pkg/raft/raft.go | 28 +++++++++++++++++++ .../testdata/fortification_checkquorum.txt | 21 +++++++++++++- pkg/raft/tracker/progress.go | 6 ++-- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go index 625a05318e28..94b636b56572 100644 --- a/pkg/raft/raft.go +++ b/pkg/raft/raft.go @@ -1080,6 +1080,16 @@ func (r *raft) tickHeartbeat() { r.logger.Debugf("error occurred during checking sending heartbeat: %v", err) } + // Mark fortifying followers as recently active. We disable heartbeats when + // leader fortification is enabled, instead deferring to StoreLiveness for + // failure detection. As such, if there is no append activity for a raft + // group, it's possible for the leader to not communicate with a follower in + // a electionTimeout. We do not want to infer that the leader can't + // communicate with a follower in such cases, which could then cause the + // leader to spuriously step down because of CheckQuorum. Instead, we + // compute RecentlyActive based on StoreLiveness instead. + r.markFortifyingFollowersAsRecentlyActive() + // Try to refortify any followers that don't currently support us. r.bcastFortify() // TODO(ibrahim): add/call maybeUnpauseAndBcastAppend() here. @@ -2478,6 +2488,24 @@ func (r *raft) testingStepDown() error { return nil } +// markFortifyingFollowersAsRecentlyActive iterates over all the followers, and +// mark them as recently active if they are supporting the leader. +func (r *raft) markFortifyingFollowersAsRecentlyActive() { + if !r.fortificationTracker.FortificationEnabled() { + return + } + + r.trk.Visit(func(id pb.PeerID, pr *tracker.Progress) { + if pr.RecentActive { + return // return early as it's already marked as recently active + } + + if isFortified, _ := r.fortificationTracker.IsFortifiedBy(id); isFortified { + pr.RecentActive = true + } + }) +} + // advanceCommitViaMsgAppOnly returns true if the commit index is advanced on // the followers using MsgApp only. This means that heartbeats are not used to // advance the commit index. This function returns true only if all followers diff --git a/pkg/raft/testdata/fortification_checkquorum.txt b/pkg/raft/testdata/fortification_checkquorum.txt index 69dd42375af5..9b76f6f14df1 100644 --- a/pkg/raft/testdata/fortification_checkquorum.txt +++ b/pkg/raft/testdata/fortification_checkquorum.txt @@ -37,9 +37,24 @@ tick-election 1 ---- ok +print-fortification-state 1 +---- +1 : 1 +2 : 1 +3 : 1 + +# Even if we haven't received any messages from the follower since last tick, +# we are still okay because the follower is fortifying us which means that it's +# active. tick-election 1 ---- -DEBUG 1 has not received messages from a quorum of peers in the last election timeout +ok + +status 1 +---- +1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10 +2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 +3: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11 # Now, withdraw the StoreLiveness support for the leader's fortified epoch by # bumping it. This also shows that we won't break our LeadSupportUntil promise @@ -52,6 +67,10 @@ bump-epoch 1 2 2 1 1 3 2 1 1 +tick-election 1 +---- +DEBUG 1 does not have store liveness support from a quorum of peers + tick-election 1 ---- DEBUG 1 has not received messages from a quorum of peers in the last election timeout diff --git a/pkg/raft/tracker/progress.go b/pkg/raft/tracker/progress.go index c011562f7d08..c3b0caf90c97 100644 --- a/pkg/raft/tracker/progress.go +++ b/pkg/raft/tracker/progress.go @@ -101,8 +101,10 @@ type Progress struct { // case the follower does not erroneously remain in StateSnapshot. PendingSnapshot uint64 - // RecentActive is true if the progress is recently active. Receiving any messages - // from the corresponding follower indicates the progress is active. + // RecentActive is true if the progress is recently active. Receiving any + // messages from the corresponding follower indicates the progress is active. + // Also, it's set to true on every heartbeat timeout if the follower is + // fortifying the leader. // RecentActive can be reset to false after an election timeout. // This is always true on the leader. RecentActive bool