Skip to content

Commit

Permalink
raft: skip fortifying followers with no liveness support epoch
Browse files Browse the repository at this point in the history
This commit makes the leader skip trying to fortify followers which are
not supported in store liveness.
  • Loading branch information
iskettaneh committed Sep 26, 2024
1 parent b5e8f81 commit e20bf34
Show file tree
Hide file tree
Showing 7 changed files with 279 additions and 54 deletions.
46 changes: 36 additions & 10 deletions pkg/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -731,13 +731,43 @@ func (r *raft) sendHeartbeat(to pb.PeerID) {
pr.MaybeUpdateSentCommit(commit)
}

// sendFortify sends a fortification RPC to the given peer.
func (r *raft) sendFortify(to pb.PeerID) {
// maybeSendFortify sends a fortification RPC to the given peer if it isn't
// fortified but the peer's store supports the leader's store in StoreLiveness.
func (r *raft) maybeSendFortify(id pb.PeerID) {
if !r.storeLiveness.SupportFromEnabled() {
// The underlying store liveness fabric hasn't been enabled to allow the
// leader to request support from peers. No-op.
return
}

isFortified, isSupported := r.fortificationTracker.IsFortifiedBy(id)

if isFortified {
return // return early if the follower's fortified.
}

if !isSupported {
// If the follower isn't providing active store liveness support to the
// leader, or it is but the leader isn't hearing about it, we don't need to
// send a fortify message. We will attempt to fortify the follower once
// store liveness support is established.
if id == r.id {
// Log if the leader doesn't support itself in the liveness fabric. This
// is possible if the leader is affected by disk stalls.
r.logger.Infof(
"%x leader at term %d does not support itself in the liveness fabric", r.id, r.Term,
)
}
return
}

// Only send a fortify message if we don't know that the follower supports us
// at the current epoch.
r.sendFortify(id)
}

// sendFortify sends a fortification RPC to the given peer.
func (r *raft) sendFortify(to pb.PeerID) {
if to == r.id {
// We handle the case where the leader is trying to fortify itself specially.
// Doing so avoids a self-addressed message.
Expand All @@ -752,10 +782,6 @@ func (r *raft) sendFortify(to pb.PeerID) {
// discrimination for who is providing support (itself vs. other
// follower).
r.send(pb.Message{To: r.id, Type: pb.MsgFortifyLeaderResp, LeadEpoch: epoch})
} else {
r.logger.Infof(
"%x leader at term %d does not support itself in the liveness fabric", r.id, r.Term,
)
}
return
}
Expand Down Expand Up @@ -787,13 +813,13 @@ func (r *raft) bcastHeartbeat() {
})
}

// bcastFortify sends an RPC to fortify the leader to all peers (including the
// leader itself).
// bcastFortify attempts to send an RPC to fortify the leader to all the peers
// (including the leader itself) whose stores are currently providing store
// liveness support to the leader's store but who have not fortified the leader.
func (r *raft) bcastFortify() {
assertTrue(r.state == StateLeader, "only leaders can fortify")

r.trk.Visit(func(id pb.PeerID, _ *tracker.Progress) {
r.sendFortify(id)
r.maybeSendFortify(id)
})
}

Expand Down
11 changes: 10 additions & 1 deletion pkg/raft/rafttest/interaction_env_handler_add_nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ func (env *InteractionEnv) AddNodes(n int, cfg raft.Config, snap pb.Snapshot) er
cfg := cfg // fork the config stub
cfg.ID, cfg.Storage = id, s

env.Fabric.addNode()
cfg.StoreLiveness = newStoreLiveness(env.Fabric, id)

// If the node creating command hasn't specified the CRDBVersion, use the
Expand Down Expand Up @@ -173,5 +172,15 @@ func (env *InteractionEnv) AddNodes(n int, cfg raft.Config, snap pb.Snapshot) er
}
env.Nodes = append(env.Nodes, node)
}

// The potential store nodes is the max between the number of nodes in the env
// and the sum of voters and learners. Add the difference between the
// potential nodes and the current store nodes.
allPotential := max(len(env.Nodes),
len(snap.Metadata.ConfState.Voters)+len(snap.Metadata.ConfState.Learners))
curNodesCount := len(env.Fabric.state) - 1 // 1-indexed stores
for rem := allPotential - curNodesCount; rem > 0; rem-- {
env.Fabric.addNode()
}
return nil
}
132 changes: 99 additions & 33 deletions pkg/raft/testdata/fortification_basic.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,56 +4,65 @@ log-level info
----
ok

add-nodes 3 voters=(1,2,3) index=2
add-nodes 4 voters=(1,2,3,4) index=2
----
INFO 1 switched to configuration voters=(1 2 3)
INFO 1 switched to configuration voters=(1 2 3 4)
INFO 1 became follower at term 0
INFO newRaft 1 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 2 switched to configuration voters=(1 2 3)
INFO newRaft 1 [peers: [1,2,3,4], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 2 switched to configuration voters=(1 2 3 4)
INFO 2 became follower at term 0
INFO newRaft 2 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 3 switched to configuration voters=(1 2 3)
INFO newRaft 2 [peers: [1,2,3,4], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 3 switched to configuration voters=(1 2 3 4)
INFO 3 became follower at term 0
INFO newRaft 3 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO newRaft 3 [peers: [1,2,3,4], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]
INFO 4 switched to configuration voters=(1 2 3 4)
INFO 4 became follower at term 0
INFO newRaft 4 [peers: [1,2,3,4], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1]

# Muck around with StoreLiveness to make it somewhat interesting.
bump-epoch 1
----
1 2 3
1 2 1 1
2 2 1 1
3 2 1 1
1 2 3 4
1 2 1 1 1
2 2 1 1 1
3 2 1 1 1
4 2 1 1 1

withdraw-support 1 1
----
1 2 3
1 x 1 1
2 2 1 1
3 2 1 1
1 2 3 4
1 x 1 1 1
2 2 1 1 1
3 2 1 1 1
4 2 1 1 1

grant-support 1 1
----
1 2 3
1 3 1 1
2 2 1 1
3 2 1 1

withdraw-support 3 1
----
1 2 3
1 3 1 1
2 2 1 1
3 x 1 1

1 2 3 4
1 3 1 1 1
2 2 1 1 1
3 2 1 1 1
4 2 1 1 1

campaign 1
----
INFO 1 is starting a new election at term 0
INFO 1 became candidate at term 1
INFO 1 [logterm: 1, index: 2] sent MsgVote request to 2 at term 1
INFO 1 [logterm: 1, index: 2] sent MsgVote request to 3 at term 1
INFO 1 [logterm: 1, index: 2] sent MsgVote request to 4 at term 1

stabilize
# Node 3 withdraws its support for node 1.
# Node 4 will withdraw support after the fortification message is sent.
withdraw-support 3 1
----
1 2 3 4
1 3 1 1 1
2 2 1 1 1
3 x 1 1 1
4 2 1 1 1

stabilize 1
----
> 1 handling Ready
Ready MustSync=true:
Expand All @@ -62,8 +71,12 @@ stabilize
Messages:
1->2 MsgVote Term:1 Log:1/2
1->3 MsgVote Term:1 Log:1/2
1->4 MsgVote Term:1 Log:1/2
INFO 1 received MsgVoteResp from 1 at term 1
INFO 1 has received 1 MsgVoteResp votes and 0 vote rejections

stabilize 2 3 4
----
> 2 receiving messages
1->2 MsgVote Term:1 Log:1/2
INFO 2 [term: 0] received a MsgVote message with higher term from 1 [term: 1]
Expand All @@ -74,6 +87,11 @@ stabilize
INFO 3 [term: 0] received a MsgVote message with higher term from 1 [term: 1]
INFO 3 became follower at term 1
INFO 3 [logterm: 1, index: 2, vote: 0] cast MsgVote for 1 [logterm: 1, index: 2] at term 1
> 4 receiving messages
1->4 MsgVote Term:1 Log:1/2
INFO 4 [term: 0] received a MsgVote message with higher term from 1 [term: 1]
INFO 4 became follower at term 1
INFO 4 [logterm: 1, index: 2, vote: 0] cast MsgVote for 1 [logterm: 1, index: 2] at term 1
> 2 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0
Expand All @@ -84,12 +102,25 @@ stabilize
HardState Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0
Messages:
3->1 MsgVoteResp Term:1 Log:0/0
> 4 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0
Messages:
4->1 MsgVoteResp Term:1 Log:0/0

# Since node 3 withdrew its support, node 1 will not send a MsgFortifyLeader to
# it.
stabilize 1
----
> 1 receiving messages
2->1 MsgVoteResp Term:1 Log:0/0
INFO 1 received MsgVoteResp from 2 at term 1
INFO 1 has received 2 MsgVoteResp votes and 0 vote rejections
INFO 1 became leader at term 1
3->1 MsgVoteResp Term:1 Log:0/0
INFO 1 received MsgVoteResp from 3 at term 1
INFO 1 has received 3 MsgVoteResp votes and 0 vote rejections
INFO 1 became leader at term 1
4->1 MsgVoteResp Term:1 Log:0/0
> 1 handling Ready
Ready MustSync=true:
State:StateLeader
Expand All @@ -98,15 +129,31 @@ stabilize
1/3 EntryNormal ""
Messages:
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->4 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""]
1->3 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""]
1->4 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""]

withdraw-support 4 1
----
1 2 3 4
1 3 1 1 1
2 2 1 1 1
3 x 1 1 1
4 x 1 1 1

# Since node 4 withdrew its support after MsgFortifyLeader is sent, node 4 will
# reject the MsgFortifyLeader message.
stabilize
----
> 2 receiving messages
1->2 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""]
> 3 receiving messages
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""]
> 4 receiving messages
1->4 MsgFortifyLeader Term:1 Log:0/0
1->4 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""]
> 2 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:2 Lead:1 LeadEpoch:2
Expand All @@ -121,13 +168,21 @@ stabilize
Entries:
1/3 EntryNormal ""
Messages:
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0)
3->1 MsgAppResp Term:1 Log:0/3 Commit:2
> 4 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:2 Lead:1 LeadEpoch:0
Entries:
1/3 EntryNormal ""
Messages:
4->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0)
4->1 MsgAppResp Term:1 Log:0/3 Commit:2
> 1 receiving messages
2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
2->1 MsgAppResp Term:1 Log:0/3 Commit:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0)
3->1 MsgAppResp Term:1 Log:0/3 Commit:2
4->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0)
4->1 MsgAppResp Term:1 Log:0/3 Commit:2
> 1 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:3
Expand All @@ -136,10 +191,13 @@ stabilize
Messages:
1->2 MsgApp Term:1 Log:1/3 Commit:3
1->3 MsgApp Term:1 Log:1/3 Commit:3
1->4 MsgApp Term:1 Log:1/3 Commit:3
> 2 receiving messages
1->2 MsgApp Term:1 Log:1/3 Commit:3
> 3 receiving messages
1->3 MsgApp Term:1 Log:1/3 Commit:3
> 4 receiving messages
1->4 MsgApp Term:1 Log:1/3 Commit:3
> 2 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:2
Expand All @@ -154,6 +212,14 @@ stabilize
1/3 EntryNormal ""
Messages:
3->1 MsgAppResp Term:1 Log:0/3 Commit:3
> 4 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:0
CommittedEntries:
1/3 EntryNormal ""
Messages:
4->1 MsgAppResp Term:1 Log:0/3 Commit:3
> 1 receiving messages
2->1 MsgAppResp Term:1 Log:0/3 Commit:3
3->1 MsgAppResp Term:1 Log:0/3 Commit:3
4->1 MsgAppResp Term:1 Log:0/3 Commit:3
8 changes: 0 additions & 8 deletions pkg/raft/testdata/fortification_support_tracking.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,36 +81,28 @@ stabilize
Entries:
1/11 EntryNormal ""
Messages:
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgApp Term:1 Log:1/10 Commit:10 Entries:[1/11 EntryNormal ""]
1->3 MsgApp Term:1 Log:1/10 Commit:10 Entries:[1/11 EntryNormal ""]
> 2 receiving messages
1->2 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgApp Term:1 Log:1/10 Commit:10 Entries:[1/11 EntryNormal ""]
> 3 receiving messages
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgApp Term:1 Log:1/10 Commit:10 Entries:[1/11 EntryNormal ""]
> 2 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:10 Lead:1 LeadEpoch:0
Entries:
1/11 EntryNormal ""
Messages:
2->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0)
2->1 MsgAppResp Term:1 Log:0/11 Commit:10
> 3 handling Ready
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:10 Lead:1 LeadEpoch:0
Entries:
1/11 EntryNormal ""
Messages:
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0)
3->1 MsgAppResp Term:1 Log:0/11 Commit:10
> 1 receiving messages
2->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0)
2->1 MsgAppResp Term:1 Log:0/11 Commit:10
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0)
3->1 MsgAppResp Term:1 Log:0/11 Commit:10
> 1 handling Ready
Ready MustSync=true:
Expand Down
7 changes: 5 additions & 2 deletions pkg/raft/testdata/snapshot_succeed_via_app_resp.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ log-level none
ok

# Start with two nodes, but the config already has a third.
add-nodes 2 voters=(1,2,3) index=10
# We set store-liveness-nodes=3 because we add 3 voters despite having 2 nodes.
add-nodes 2 voters=(1,2,3) index=10 store-liveness-nodes=3
----
ok

Expand Down Expand Up @@ -47,7 +48,9 @@ status 1

# Add the node that will receive a snapshot (it has no state at all, does not
# even have a config).
add-nodes 1
# We set store-liveness-nodes=0 because we already added a third
# store-liveness-node earlier.
add-nodes 1 store-liveness-nodes=0
----
INFO 3 switched to configuration voters=()
INFO 3 became follower at term 0
Expand Down
Loading

0 comments on commit e20bf34

Please sign in to comment.