Skip to content

Commit

Permalink
raft: mark supporting followers as recently active
Browse files Browse the repository at this point in the history
This commit marks followers who support the current leader as recently
active. This is very important because if we stop heartbeats, we might
not send any MsgApp to the followers because there is nothing new to
append. This means that we will mark the followers as inactive, even
though that they are technically active and supporting the leader.

Fixes: #131349

Release note: None
  • Loading branch information
iskettaneh committed Oct 1, 2024
1 parent 53be53d commit 420b28e
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 3 deletions.
28 changes: 28 additions & 0 deletions pkg/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -1080,6 +1080,16 @@ func (r *raft) tickHeartbeat() {
r.logger.Debugf("error occurred during checking sending heartbeat: %v", err)
}

// Mark fortifying followers as recently active. We disable heartbeats when
// leader fortification is enabled, instead deferring to StoreLiveness for
// failure detection. As such, if there is no append activity for a raft
// group, it's possible for the leader to not communicate with a follower in
// a electionTimeout. We do not want to infer that the leader can't
// communicate with a follower in such cases, which could then cause the
// leader to spuriously step down because of CheckQuorum. Instead, we
// compute RecentlyActive based on StoreLiveness instead.
r.markFortifyingFollowersAsRecentlyActive()

// Try to refortify any followers that don't currently support us.
r.bcastFortify()
// TODO(ibrahim): add/call maybeUnpauseAndBcastAppend() here.
Expand Down Expand Up @@ -2478,6 +2488,24 @@ func (r *raft) testingStepDown() error {
return nil
}

// markFortifyingFollowersAsRecentlyActive iterates over all the followers, and
// mark them as recently active if they are supporting the leader.
func (r *raft) markFortifyingFollowersAsRecentlyActive() {
if !r.fortificationTracker.FortificationEnabled() {
return
}

r.trk.Visit(func(id pb.PeerID, pr *tracker.Progress) {
if pr.RecentActive {
return // return early as it's already marked as recently active
}

if isFortified, _ := r.fortificationTracker.IsFortifiedBy(id); isFortified {
pr.RecentActive = true
}
})
}

// advanceCommitViaMsgAppOnly returns true if the commit index is advanced on
// the followers using MsgApp only. This means that heartbeats are not used to
// advance the commit index. This function returns true only if all followers
Expand Down
21 changes: 20 additions & 1 deletion pkg/raft/testdata/fortification_checkquorum.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,24 @@ tick-election 1
----
ok

print-fortification-state 1
----
1 : 1
2 : 1
3 : 1

# Even if we haven't received any messages from the follower since last tick,
# we are still okay because the follower is fortifying us which means that it's
# active.
tick-election 1
----
DEBUG 1 has not received messages from a quorum of peers in the last election timeout
ok

status 1
----
1: StateReplicate match=11 next=12 sentCommit=10 matchCommit=10
2: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11
3: StateReplicate match=11 next=12 sentCommit=11 matchCommit=11

# Now, withdraw the StoreLiveness support for the leader's fortified epoch by
# bumping it. This also shows that we won't break our LeadSupportUntil promise
Expand All @@ -52,6 +67,10 @@ bump-epoch 1
2 2 1 1
3 2 1 1

tick-election 1
----
DEBUG 1 does not have store liveness support from a quorum of peers

tick-election 1
----
DEBUG 1 has not received messages from a quorum of peers in the last election timeout
Expand Down
6 changes: 4 additions & 2 deletions pkg/raft/tracker/progress.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,10 @@ type Progress struct {
// case the follower does not erroneously remain in StateSnapshot.
PendingSnapshot uint64

// RecentActive is true if the progress is recently active. Receiving any messages
// from the corresponding follower indicates the progress is active.
// RecentActive is true if the progress is recently active. Receiving any
// messages from the corresponding follower indicates the progress is active.
// Also, it's set to true on every heartbeat timeout if the follower is
// fortifying the leader.
// RecentActive can be reset to false after an election timeout.
// This is always true on the leader.
RecentActive bool
Expand Down

0 comments on commit 420b28e

Please sign in to comment.