From e20bf34fecbf0206f25b1f7b4fa1b055180b65b2 Mon Sep 17 00:00:00 2001 From: Ibrahim Kettaneh Date: Mon, 16 Sep 2024 09:11:50 -0400 Subject: [PATCH 1/2] raft: skip fortifying followers with no liveness support epoch This commit makes the leader skip trying to fortify followers which are not supported in store liveness. --- pkg/raft/raft.go | 46 ++++-- .../interaction_env_handler_add_nodes.go | 11 +- pkg/raft/testdata/fortification_basic.txt | 132 +++++++++++++----- .../fortification_support_tracking.txt | 8 -- .../snapshot_succeed_via_app_resp.txt | 7 +- pkg/raft/tracker/fortificationtracker.go | 26 ++++ pkg/raft/tracker/fortificationtracker_test.go | 103 ++++++++++++++ 7 files changed, 279 insertions(+), 54 deletions(-) diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go index 6732e9e069de..c56a688bfa61 100644 --- a/pkg/raft/raft.go +++ b/pkg/raft/raft.go @@ -731,13 +731,43 @@ func (r *raft) sendHeartbeat(to pb.PeerID) { pr.MaybeUpdateSentCommit(commit) } -// sendFortify sends a fortification RPC to the given peer. -func (r *raft) sendFortify(to pb.PeerID) { +// maybeSendFortify sends a fortification RPC to the given peer if it isn't +// fortified but the peer's store supports the leader's store in StoreLiveness. +func (r *raft) maybeSendFortify(id pb.PeerID) { if !r.storeLiveness.SupportFromEnabled() { // The underlying store liveness fabric hasn't been enabled to allow the // leader to request support from peers. No-op. return } + + isFortified, isSupported := r.fortificationTracker.IsFortifiedBy(id) + + if isFortified { + return // return early if the follower's fortified. + } + + if !isSupported { + // If the follower isn't providing active store liveness support to the + // leader, or it is but the leader isn't hearing about it, we don't need to + // send a fortify message. We will attempt to fortify the follower once + // store liveness support is established. + if id == r.id { + // Log if the leader doesn't support itself in the liveness fabric. 
This + // is possible if the leader is affected by disk stalls. + r.logger.Infof( + "%x leader at term %d does not support itself in the liveness fabric", r.id, r.Term, + ) + } + return + } + + // The follower's store supports us in store liveness, but we don't know that + // the follower has fortified us at the current epoch; send a fortify message. + r.sendFortify(id) +} + +// sendFortify sends a fortification RPC to the given peer. +func (r *raft) sendFortify(to pb.PeerID) { if to == r.id { // We handle the case where the leader is trying to fortify itself specially. // Doing so avoids a self-addressed message. @@ -752,10 +782,6 @@ func (r *raft) sendFortify(to pb.PeerID) { // discrimination for who is providing support (itself vs. other // follower). r.send(pb.Message{To: r.id, Type: pb.MsgFortifyLeaderResp, LeadEpoch: epoch}) - } else { - r.logger.Infof( - "%x leader at term %d does not support itself in the liveness fabric", r.id, r.Term, - ) } return } @@ -787,13 +813,13 @@ func (r *raft) bcastHeartbeat() { }) } -// bcastFortify sends an RPC to fortify the leader to all peers (including the -// leader itself). +// bcastFortify attempts to send an RPC to fortify the leader to all the peers +// (including the leader itself) whose stores are currently providing store +// liveness support to the leader's store but who have not fortified the leader. 
func (r *raft) bcastFortify() { assertTrue(r.state == StateLeader, "only leaders can fortify") - r.trk.Visit(func(id pb.PeerID, _ *tracker.Progress) { - r.sendFortify(id) + r.maybeSendFortify(id) }) } diff --git a/pkg/raft/rafttest/interaction_env_handler_add_nodes.go b/pkg/raft/rafttest/interaction_env_handler_add_nodes.go index d075c54f3e76..06549aa1f52b 100644 --- a/pkg/raft/rafttest/interaction_env_handler_add_nodes.go +++ b/pkg/raft/rafttest/interaction_env_handler_add_nodes.go @@ -136,7 +136,6 @@ func (env *InteractionEnv) AddNodes(n int, cfg raft.Config, snap pb.Snapshot) er cfg := cfg // fork the config stub cfg.ID, cfg.Storage = id, s - env.Fabric.addNode() cfg.StoreLiveness = newStoreLiveness(env.Fabric, id) // If the node creating command hasn't specified the CRDBVersion, use the @@ -173,5 +172,15 @@ func (env *InteractionEnv) AddNodes(n int, cfg raft.Config, snap pb.Snapshot) er } env.Nodes = append(env.Nodes, node) } + + // The number of potential store nodes is the max of the number of nodes in + // the env and the sum of voters and learners. Add the difference between the + // number of potential nodes and the current number of store nodes. 
+ allPotential := max(len(env.Nodes), + len(snap.Metadata.ConfState.Voters)+len(snap.Metadata.ConfState.Learners)) + curNodesCount := len(env.Fabric.state) - 1 // 1-indexed stores + for rem := allPotential - curNodesCount; rem > 0; rem-- { + env.Fabric.addNode() + } return nil } diff --git a/pkg/raft/testdata/fortification_basic.txt b/pkg/raft/testdata/fortification_basic.txt index 2d1f19467824..d0db28b281ba 100644 --- a/pkg/raft/testdata/fortification_basic.txt +++ b/pkg/raft/testdata/fortification_basic.txt @@ -4,47 +4,45 @@ log-level info ---- ok -add-nodes 3 voters=(1,2,3) index=2 +add-nodes 4 voters=(1,2,3,4) index=2 ---- -INFO 1 switched to configuration voters=(1 2 3) +INFO 1 switched to configuration voters=(1 2 3 4) INFO 1 became follower at term 0 -INFO newRaft 1 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1] -INFO 2 switched to configuration voters=(1 2 3) +INFO newRaft 1 [peers: [1,2,3,4], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1] +INFO 2 switched to configuration voters=(1 2 3 4) INFO 2 became follower at term 0 -INFO newRaft 2 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1] -INFO 3 switched to configuration voters=(1 2 3) +INFO newRaft 2 [peers: [1,2,3,4], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1] +INFO 3 switched to configuration voters=(1 2 3 4) INFO 3 became follower at term 0 -INFO newRaft 3 [peers: [1,2,3], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1] +INFO newRaft 3 [peers: [1,2,3,4], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1] +INFO 4 switched to configuration voters=(1 2 3 4) +INFO 4 became follower at term 0 +INFO newRaft 4 [peers: [1,2,3,4], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1] # Muck around with StoreLiveness to make it somewhat interesting. 
bump-epoch 1 ---- - 1 2 3 -1 2 1 1 -2 2 1 1 -3 2 1 1 + 1 2 3 4 +1 2 1 1 1 +2 2 1 1 1 +3 2 1 1 1 +4 2 1 1 1 withdraw-support 1 1 ---- - 1 2 3 -1 x 1 1 -2 2 1 1 -3 2 1 1 + 1 2 3 4 +1 x 1 1 1 +2 2 1 1 1 +3 2 1 1 1 +4 2 1 1 1 grant-support 1 1 ---- - 1 2 3 -1 3 1 1 -2 2 1 1 -3 2 1 1 - -withdraw-support 3 1 ----- - 1 2 3 -1 3 1 1 -2 2 1 1 -3 x 1 1 - + 1 2 3 4 +1 3 1 1 1 +2 2 1 1 1 +3 2 1 1 1 +4 2 1 1 1 campaign 1 ---- @@ -52,8 +50,19 @@ INFO 1 is starting a new election at term 0 INFO 1 became candidate at term 1 INFO 1 [logterm: 1, index: 2] sent MsgVote request to 2 at term 1 INFO 1 [logterm: 1, index: 2] sent MsgVote request to 3 at term 1 +INFO 1 [logterm: 1, index: 2] sent MsgVote request to 4 at term 1 -stabilize +# Node 3 withdraws its support for node 1. +# Node 4 will withdraw support after the fortification message is sent. +withdraw-support 3 1 +---- + 1 2 3 4 +1 3 1 1 1 +2 2 1 1 1 +3 x 1 1 1 +4 2 1 1 1 + +stabilize 1 ---- > 1 handling Ready Ready MustSync=true: @@ -62,8 +71,12 @@ stabilize Messages: 1->2 MsgVote Term:1 Log:1/2 1->3 MsgVote Term:1 Log:1/2 + 1->4 MsgVote Term:1 Log:1/2 INFO 1 received MsgVoteResp from 1 at term 1 INFO 1 has received 1 MsgVoteResp votes and 0 vote rejections + +stabilize 2 3 4 +---- > 2 receiving messages 1->2 MsgVote Term:1 Log:1/2 INFO 2 [term: 0] received a MsgVote message with higher term from 1 [term: 1] @@ -74,6 +87,11 @@ stabilize INFO 3 [term: 0] received a MsgVote message with higher term from 1 [term: 1] INFO 3 became follower at term 1 INFO 3 [logterm: 1, index: 2, vote: 0] cast MsgVote for 1 [logterm: 1, index: 2] at term 1 +> 4 receiving messages + 1->4 MsgVote Term:1 Log:1/2 + INFO 4 [term: 0] received a MsgVote message with higher term from 1 [term: 1] + INFO 4 became follower at term 1 + INFO 4 [logterm: 1, index: 2, vote: 0] cast MsgVote for 1 [logterm: 1, index: 2] at term 1 > 2 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0 @@ -84,12 +102,25 @@ stabilize HardState 
Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0 Messages: 3->1 MsgVoteResp Term:1 Log:0/0 +> 4 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0 + Messages: + 4->1 MsgVoteResp Term:1 Log:0/0 + +# Since node 3 withdrew its support, node 1 will not send a MsgFortifyLeader to +# it. +stabilize 1 +---- > 1 receiving messages 2->1 MsgVoteResp Term:1 Log:0/0 INFO 1 received MsgVoteResp from 2 at term 1 INFO 1 has received 2 MsgVoteResp votes and 0 vote rejections - INFO 1 became leader at term 1 3->1 MsgVoteResp Term:1 Log:0/0 + INFO 1 received MsgVoteResp from 3 at term 1 + INFO 1 has received 3 MsgVoteResp votes and 0 vote rejections + INFO 1 became leader at term 1 + 4->1 MsgVoteResp Term:1 Log:0/0 > 1 handling Ready Ready MustSync=true: State:StateLeader @@ -98,15 +129,31 @@ stabilize 1/3 EntryNormal "" Messages: 1->2 MsgFortifyLeader Term:1 Log:0/0 - 1->3 MsgFortifyLeader Term:1 Log:0/0 + 1->4 MsgFortifyLeader Term:1 Log:0/0 1->2 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] 1->3 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] + 1->4 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] + +withdraw-support 4 1 +---- + 1 2 3 4 +1 3 1 1 1 +2 2 1 1 1 +3 x 1 1 1 +4 x 1 1 1 + +# Since node 4 withdrew its support after MsgFortifyLeader is sent, node 4 will +# reject the MsgFortifyLeader message. 
+stabilize +---- > 2 receiving messages 1->2 MsgFortifyLeader Term:1 Log:0/0 1->2 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] > 3 receiving messages - 1->3 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] +> 4 receiving messages + 1->4 MsgFortifyLeader Term:1 Log:0/0 + 1->4 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] > 2 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:2 Lead:1 LeadEpoch:2 @@ -121,13 +168,21 @@ stabilize Entries: 1/3 EntryNormal "" Messages: - 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0) 3->1 MsgAppResp Term:1 Log:0/3 Commit:2 +> 4 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:2 Lead:1 LeadEpoch:0 + Entries: + 1/3 EntryNormal "" + Messages: + 4->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0) + 4->1 MsgAppResp Term:1 Log:0/3 Commit:2 > 1 receiving messages 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 2->1 MsgAppResp Term:1 Log:0/3 Commit:2 - 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0) 3->1 MsgAppResp Term:1 Log:0/3 Commit:2 + 4->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0) + 4->1 MsgAppResp Term:1 Log:0/3 Commit:2 > 1 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:3 @@ -136,10 +191,13 @@ stabilize Messages: 1->2 MsgApp Term:1 Log:1/3 Commit:3 1->3 MsgApp Term:1 Log:1/3 Commit:3 + 1->4 MsgApp Term:1 Log:1/3 Commit:3 > 2 receiving messages 1->2 MsgApp Term:1 Log:1/3 Commit:3 > 3 receiving messages 1->3 MsgApp Term:1 Log:1/3 Commit:3 +> 4 receiving messages + 1->4 MsgApp Term:1 Log:1/3 Commit:3 > 2 handling Ready Ready MustSync=true: HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:2 @@ -154,6 +212,14 @@ stabilize 1/3 EntryNormal "" Messages: 3->1 MsgAppResp Term:1 Log:0/3 Commit:3 +> 4 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:0 + CommittedEntries: + 1/3 EntryNormal "" + Messages: + 
4->1 MsgAppResp Term:1 Log:0/3 Commit:3 > 1 receiving messages 2->1 MsgAppResp Term:1 Log:0/3 Commit:3 3->1 MsgAppResp Term:1 Log:0/3 Commit:3 + 4->1 MsgAppResp Term:1 Log:0/3 Commit:3 diff --git a/pkg/raft/testdata/fortification_support_tracking.txt b/pkg/raft/testdata/fortification_support_tracking.txt index ccc5e64ea152..e307a729f154 100644 --- a/pkg/raft/testdata/fortification_support_tracking.txt +++ b/pkg/raft/testdata/fortification_support_tracking.txt @@ -81,15 +81,11 @@ stabilize Entries: 1/11 EntryNormal "" Messages: - 1->2 MsgFortifyLeader Term:1 Log:0/0 - 1->3 MsgFortifyLeader Term:1 Log:0/0 1->2 MsgApp Term:1 Log:1/10 Commit:10 Entries:[1/11 EntryNormal ""] 1->3 MsgApp Term:1 Log:1/10 Commit:10 Entries:[1/11 EntryNormal ""] > 2 receiving messages - 1->2 MsgFortifyLeader Term:1 Log:0/0 1->2 MsgApp Term:1 Log:1/10 Commit:10 Entries:[1/11 EntryNormal ""] > 3 receiving messages - 1->3 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgApp Term:1 Log:1/10 Commit:10 Entries:[1/11 EntryNormal ""] > 2 handling Ready Ready MustSync=true: @@ -97,7 +93,6 @@ stabilize Entries: 1/11 EntryNormal "" Messages: - 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0) 2->1 MsgAppResp Term:1 Log:0/11 Commit:10 > 3 handling Ready Ready MustSync=true: @@ -105,12 +100,9 @@ stabilize Entries: 1/11 EntryNormal "" Messages: - 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0) 3->1 MsgAppResp Term:1 Log:0/11 Commit:10 > 1 receiving messages - 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0) 2->1 MsgAppResp Term:1 Log:0/11 Commit:10 - 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 Rejected (Hint: 0) 3->1 MsgAppResp Term:1 Log:0/11 Commit:10 > 1 handling Ready Ready MustSync=true: diff --git a/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt b/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt index 9c96104e3dc6..97a230c1364d 100644 --- a/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt +++ b/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt @@ -12,7 +12,8 @@ 
log-level none ok # Start with two nodes, but the config already has a third. -add-nodes 2 voters=(1,2,3) index=10 +# We set store-liveness-nodes=3 because we add 3 voters despite having 2 nodes. +add-nodes 2 voters=(1,2,3) index=10 store-liveness-nodes=3 ---- ok @@ -47,7 +48,9 @@ status 1 # Add the node that will receive a snapshot (it has no state at all, does not # even have a config). -add-nodes 1 +# We set store-liveness-nodes=0 because we already added a third +# store-liveness-node earlier. +add-nodes 1 store-liveness-nodes=0 ---- INFO 3 switched to configuration voters=() INFO 3 became follower at term 0 diff --git a/pkg/raft/tracker/fortificationtracker.go b/pkg/raft/tracker/fortificationtracker.go index a05a172185de..3ba2e4da97c7 100644 --- a/pkg/raft/tracker/fortificationtracker.go +++ b/pkg/raft/tracker/fortificationtracker.go @@ -61,6 +61,32 @@ func (st *FortificationTracker) Reset() { // down. } +// IsFortifiedBy returns whether the follower fortifies the leader or not. +// If the follower's store doesn't support the leader's store in the store +// liveness fabric, then both isSupported and isFortified will be false. +// If isFortified is true, it implies that isSupported is also true. +func (st *FortificationTracker) IsFortifiedBy(id pb.PeerID) (isFortified bool, isSupported bool) { + supportEpoch, curExp := st.storeLiveness.SupportFrom(id) + if st.storeLiveness.SupportExpired(curExp) { + return false, false + } + + // At this point we know that the follower's store is providing support + // at the store liveness fabric. + fortificationEpoch, exist := st.fortification[id] + if !exist { + // We don't know that the follower is fortified. 
+ return false, true + } + + // NB: We can't assert that fortificationEpoch <= supportEpoch because there + // may be a race between a successful MsgFortifyLeaderResp and the store + // liveness heartbeat response that lets the leader know the follower's store + // is supporting the leader's store at the epoch in the MsgFortifyLeaderResp + // message. + return fortificationEpoch == supportEpoch, true +} + // LeadSupportUntil returns the timestamp until which the leader is guaranteed // fortification until based on the fortification being tracked for it by its // peers. diff --git a/pkg/raft/tracker/fortificationtracker_test.go b/pkg/raft/tracker/fortificationtracker_test.go index c4d1ff78f988..cdd1fada1f14 100644 --- a/pkg/raft/tracker/fortificationtracker_test.go +++ b/pkg/raft/tracker/fortificationtracker_test.go @@ -143,6 +143,109 @@ func TestLeadSupportUntil(t *testing.T) { } } +func TestIsSupportedBy(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ts := func(ts int64) hlc.Timestamp { + return hlc.Timestamp{ + WallTime: ts, + } + } + + mockLivenessOnePeer := makeMockStoreLiveness( + map[pb.PeerID]mockLivenessEntry{ + 1: makeMockLivenessEntry(10, ts(20)), + }, + ) + + testCases := []struct { + ids []pb.PeerID + storeLiveness raftstoreliveness.StoreLiveness + setup func(tracker *FortificationTracker) + expSupported bool + expFortified bool + }{ + { + ids: []pb.PeerID{1}, + // No support recorded at the store liveness fabric. + storeLiveness: makeMockStoreLiveness(map[pb.PeerID]mockLivenessEntry{}), + setup: func(supportTracker *FortificationTracker) { + // No support recorded. + }, + expSupported: false, + expFortified: false, + }, + { + ids: []pb.PeerID{1}, + storeLiveness: mockLivenessOnePeer, + setup: func(supportTracker *FortificationTracker) { + // No support recorded. 
+ }, + expSupported: true, + expFortified: false, + }, + { + ids: []pb.PeerID{2}, + storeLiveness: mockLivenessOnePeer, + setup: func(supportTracker *FortificationTracker) { + // Support recorded for a different follower than the one in + // storeLiveness. + supportTracker.RecordFortification(2, 10) + }, + expSupported: true, + expFortified: false, + }, + { + ids: []pb.PeerID{1}, + storeLiveness: mockLivenessOnePeer, + setup: func(supportTracker *FortificationTracker) { + // Support recorded for an expired epoch. + supportTracker.RecordFortification(1, 9) + }, + expSupported: true, + expFortified: false, + }, + { + ids: []pb.PeerID{1}, + storeLiveness: mockLivenessOnePeer, + setup: func(supportTracker *FortificationTracker) { + // Record support at newer epochs than what are present in + // StoreLiveness. + // + // NB: This is possible if there is a race between store liveness + // heartbeats updates and fortification responses. + supportTracker.RecordFortification(1, 11) + }, + expSupported: true, + expFortified: false, + }, + { + ids: []pb.PeerID{1}, + storeLiveness: mockLivenessOnePeer, + setup: func(supportTracker *FortificationTracker) { + // Record support at the same epoch as the storeLiveness. + supportTracker.RecordFortification(1, 10) + }, + expSupported: true, + expFortified: true, + }, + } + + for _, tc := range testCases { + cfg := quorum.MakeEmptyConfig() + for _, id := range tc.ids { + cfg.Voters[0][id] = struct{}{} + } + supportTracker := MakeFortificationTracker(&cfg, tc.storeLiveness) + + tc.setup(&supportTracker) + isFortified, isSupported := supportTracker.IsFortifiedBy(1) + require.Equal(t, tc.expSupported, isSupported) + require.Equal(t, tc.expFortified, isFortified) + } +} + // TestQuorumActive ensures that we correctly determine whether a leader's // quorum is active or not. 
func TestQuorumActive(t *testing.T) { From edc3277f048437e4c31387c68e4e4d8f53078f54 Mon Sep 17 00:00:00 2001 From: Ibrahim Kettaneh Date: Mon, 16 Sep 2024 09:27:03 -0400 Subject: [PATCH 2/2] raft: refortify followers if they are not fortified This commit makes the leader periodically attempt to fortify unfortified followers. Before this change, the leader used to send a fortification message to followers, but didn't try to refortify unfortified followers. --- pkg/raft/BUILD.bazel | 1 + pkg/raft/raft.go | 6 +- pkg/raft/raft_paper_test.go | 52 +++- pkg/raft/raft_test.go | 5 + .../async_storage_writes_append_aba_race.txt | 19 +- pkg/raft/testdata/checkquorum.txt | 48 ++- pkg/raft/testdata/refortification_basic.txt | 287 ++++++++++++++++++ .../snapshot_succeed_via_app_resp.txt | 10 +- 8 files changed, 403 insertions(+), 25 deletions(-) create mode 100644 pkg/raft/testdata/refortification_basic.txt diff --git a/pkg/raft/BUILD.bazel b/pkg/raft/BUILD.bazel index 4061909348c0..1761144ae87a 100644 --- a/pkg/raft/BUILD.bazel +++ b/pkg/raft/BUILD.bazel @@ -58,6 +58,7 @@ go_test( "//pkg/raft/rafttest", "//pkg/raft/tracker", "//pkg/settings/cluster", + "//pkg/testutils", "@com_github_cockroachdb_datadriven//:datadriven", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go index c56a688bfa61..ccba05e6ad12 100644 --- a/pkg/raft/raft.go +++ b/pkg/raft/raft.go @@ -743,7 +743,7 @@ func (r *raft) maybeSendFortify(id pb.PeerID) { isFortified, isSupported := r.fortificationTracker.IsFortifiedBy(id) if isFortified { - return // return early if the follower's fortified. 
+ return // return early if the follower's fortified } if !isSupported { @@ -1038,6 +1038,10 @@ func (r *raft) tickHeartbeat() { if err := r.Step(pb.Message{From: r.id, Type: pb.MsgBeat}); err != nil { r.logger.Debugf("error occurred during checking sending heartbeat: %v", err) } + + // Try to fortify any supporting followers that haven't fortified us yet. + r.bcastFortify() + // TODO(ibrahim): add/call maybeUnpauseAndBcastAppend() here. } } diff --git a/pkg/raft/raft_paper_test.go b/pkg/raft/raft_paper_test.go index 745dd193fd71..dcda012571e9 100644 --- a/pkg/raft/raft_paper_test.go +++ b/pkg/raft/raft_paper_test.go @@ -35,6 +35,7 @@ import ( "testing" pb "github.com/cockroachdb/cockroach/pkg/raft/raftpb" + "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -105,23 +106,44 @@ func TestStartAsFollower(t *testing.T) { func TestLeaderBcastBeat(t *testing.T) { // heartbeat interval hi := 1 - r := newTestRaft(1, 10, hi, newTestMemoryStorage(withPeers(1, 2, 3))) - r.becomeCandidate() - r.becomeLeader() - for i := 0; i < 10; i++ { - mustAppendEntry(r, pb.Entry{Index: uint64(i) + 1}) - } - for i := 0; i < hi; i++ { - r.tick() - } + testutils.RunTrueAndFalse(t, "store-liveness-enabled", + func(t *testing.T, storeLivenessEnabled bool) { + testOptions := emptyTestConfigModifierOpt() + if !storeLivenessEnabled { + testOptions = withFortificationDisabled() + } - msgs := r.readMessages() - sort.Sort(messageSlice(msgs)) - assert.Equal(t, []pb.Message{ - {From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat}, - {From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat}, - }, msgs) + r := newTestRaft(1, 10, hi, + newTestMemoryStorage(withPeers(1, 2, 3)), testOptions) + + r.becomeCandidate() + r.becomeLeader() + + for i := 0; i < 10; i++ { + mustAppendEntry(r, pb.Entry{Index: uint64(i) + 1}) + } + + for i := 0; i < hi; i++ { + r.tick() + } + + msgs := r.readMessages() + sort.Sort(messageSlice(msgs)) + if storeLivenessEnabled { 
+ assert.Equal(t, []pb.Message{ + {From: 1, To: 2, Term: 1, Type: pb.MsgFortifyLeader}, + {From: 1, To: 3, Term: 1, Type: pb.MsgFortifyLeader}, + {From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat}, + {From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat}, + }, msgs) + } else { + assert.Equal(t, []pb.Message{ + {From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat}, + {From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat}, + }, msgs) + } + }) } func TestFollowerStartElection(t *testing.T) { diff --git a/pkg/raft/raft_test.go b/pkg/raft/raft_test.go index a77e5168a084..5d07f0fb4627 100644 --- a/pkg/raft/raft_test.go +++ b/pkg/raft/raft_test.go @@ -4112,6 +4112,11 @@ type testConfigModifiers struct { // that may be used to modify the config. type testConfigModifierOpt func(*testConfigModifiers) +// emptyTestConfigModifierOpt returns an empty testConfigModifierOpt. +func emptyTestConfigModifierOpt() testConfigModifierOpt { + return func(modifier *testConfigModifiers) {} +} + // withRaftFortification disables raft fortification. 
func withFortificationDisabled() testConfigModifierOpt { return func(modifier *testConfigModifiers) { diff --git a/pkg/raft/testdata/async_storage_writes_append_aba_race.txt b/pkg/raft/testdata/async_storage_writes_append_aba_race.txt index c1f6e0a6ce79..32e7cba194ad 100644 --- a/pkg/raft/testdata/async_storage_writes_append_aba_race.txt +++ b/pkg/raft/testdata/async_storage_writes_append_aba_race.txt @@ -414,20 +414,32 @@ Messages: 4->5 MsgHeartbeat Term:3 Log:0/0 4->6 MsgHeartbeat Term:3 Log:0/0 4->7 MsgHeartbeat Term:3 Log:0/0 +4->1 MsgFortifyLeader Term:3 Log:0/0 +4->2 MsgFortifyLeader Term:3 Log:0/0 +4->3 MsgFortifyLeader Term:3 Log:0/0 +4->5 MsgFortifyLeader Term:3 Log:0/0 +4->6 MsgFortifyLeader Term:3 Log:0/0 +4->7 MsgFortifyLeader Term:3 Log:0/0 +4->AppendThread MsgStorageAppend Term:0 Log:0/0 Responses:[ + 4->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1 +] deliver-msgs 1 ---- 4->1 MsgHeartbeat Term:3 Log:0/0 INFO 1 [term: 2] received a MsgHeartbeat message with higher term from 4 [term: 3] INFO 1 became follower at term 3 +4->1 MsgFortifyLeader Term:3 Log:0/0 process-ready 1 ---- Ready MustSync=true: -HardState Term:3 Commit:11 Lead:4 LeadEpoch:0 +HardState Term:3 Commit:11 Lead:4 LeadEpoch:1 Messages: 1->4 MsgHeartbeatResp Term:3 Log:0/0 -1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4 +1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4 LeadEpoch:1 Responses:[ + 1->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1 +] deliver-msgs 4 ---- @@ -513,8 +525,9 @@ INFO mark (term,index)=(2,12) mismatched the last accepted term 3 in unstable lo process-append-thread 1 ---- Processing: -1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4 +1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4 LeadEpoch:1 Responses: +1->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1 raft-log 1 ---- diff --git a/pkg/raft/testdata/checkquorum.txt b/pkg/raft/testdata/checkquorum.txt index f642ce72ba47..4ac9dd920303 100644 
--- a/pkg/raft/testdata/checkquorum.txt +++ b/pkg/raft/testdata/checkquorum.txt @@ -73,31 +73,57 @@ INFO 1 became follower at term 1 stabilize ---- > 1 handling Ready - Ready MustSync=false: + Ready MustSync=true: State:StateFollower + HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2 Messages: 1->2 MsgHeartbeat Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 1->2 MsgHeartbeat Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 1->2 MsgHeartbeat Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 1->2 MsgHeartbeat Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 1->2 MsgHeartbeat Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 > 2 receiving messages 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1] 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1] 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1] 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1] 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1] > 3 receiving messages 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 
Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 > 2 handling Ready Ready MustSync=false: Messages: @@ -107,13 +133,19 @@ stabilize 2->1 MsgAppResp Term:2 Log:0/0 2->1 MsgAppResp Term:2 Log:0/0 > 3 handling Ready - Ready MustSync=false: + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2 Messages: 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgHeartbeatResp Term:1 Log:0/0 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 > 1 receiving messages 2->1 MsgAppResp Term:2 Log:0/0 INFO 1 [term: 1] received a MsgAppResp message with higher term from 2 [term: 2] @@ -132,6 +164,16 @@ stabilize INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] 3->1 MsgHeartbeatResp Term:1 Log:0/0 INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1] + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1] + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1] + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1] + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1] + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + INFO 1 [term: 2] ignored 
a MsgFortifyLeaderResp message with lower term from 3 [term: 1] > 1 handling Ready Ready MustSync=true: HardState Term:2 Commit:11 Lead:0 LeadEpoch:0 @@ -165,7 +207,7 @@ INFO 1 [logterm: 1, index: 11, vote: 0] cast MsgVote for 2 [logterm: 1, index: 1 deliver-msgs 3 ---- 2->3 MsgVote Term:3 Log:1/11 -INFO 3 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) +INFO 3 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) and supporting fortified leader 1 at epoch 2 stabilize ---- diff --git a/pkg/raft/testdata/refortification_basic.txt b/pkg/raft/testdata/refortification_basic.txt new file mode 100644 index 000000000000..56369ceff4db --- /dev/null +++ b/pkg/raft/testdata/refortification_basic.txt @@ -0,0 +1,287 @@ +# Basic tests for leader refortification. + +log-level none +---- +ok + +add-nodes 3 voters=(1,2,3) index=2 +---- +ok + +log-level info +---- +ok + +# Muck around with StoreLiveness to make it somewhat interesting. 
+bump-epoch 1 +---- + 1 2 3 +1 2 1 1 +2 2 1 1 +3 2 1 1 + +withdraw-support 1 1 +---- + 1 2 3 +1 x 1 1 +2 2 1 1 +3 2 1 1 + +grant-support 1 1 +---- + 1 2 3 +1 3 1 1 +2 2 1 1 +3 2 1 1 + +withdraw-support 3 1 +---- + 1 2 3 +1 3 1 1 +2 2 1 1 +3 x 1 1 + +campaign 1 +---- +INFO 1 is starting a new election at term 0 +INFO 1 became candidate at term 1 +INFO 1 [logterm: 1, index: 2] sent MsgVote request to 2 at term 1 +INFO 1 [logterm: 1, index: 2] sent MsgVote request to 3 at term 1 + +stabilize +---- +> 1 handling Ready + Ready MustSync=true: + State:StateCandidate + HardState Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0 + Messages: + 1->2 MsgVote Term:1 Log:1/2 + 1->3 MsgVote Term:1 Log:1/2 + INFO 1 received MsgVoteResp from 1 at term 1 + INFO 1 has received 1 MsgVoteResp votes and 0 vote rejections +> 2 receiving messages + 1->2 MsgVote Term:1 Log:1/2 + INFO 2 [term: 0] received a MsgVote message with higher term from 1 [term: 1] + INFO 2 became follower at term 1 + INFO 2 [logterm: 1, index: 2, vote: 0] cast MsgVote for 1 [logterm: 1, index: 2] at term 1 +> 3 receiving messages + 1->3 MsgVote Term:1 Log:1/2 + INFO 3 [term: 0] received a MsgVote message with higher term from 1 [term: 1] + INFO 3 became follower at term 1 + INFO 3 [logterm: 1, index: 2, vote: 0] cast MsgVote for 1 [logterm: 1, index: 2] at term 1 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0 + Messages: + 2->1 MsgVoteResp Term:1 Log:0/0 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:2 Lead:0 LeadEpoch:0 + Messages: + 3->1 MsgVoteResp Term:1 Log:0/0 +> 1 receiving messages + 2->1 MsgVoteResp Term:1 Log:0/0 + INFO 1 received MsgVoteResp from 2 at term 1 + INFO 1 has received 2 MsgVoteResp votes and 0 vote rejections + INFO 1 became leader at term 1 + 3->1 MsgVoteResp Term:1 Log:0/0 +> 1 handling Ready + Ready MustSync=true: + State:StateLeader + HardState Term:1 Vote:1 Commit:2 Lead:1 LeadEpoch:3 + Entries: + 1/3 EntryNormal "" + 
Messages: + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->2 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] + 1->3 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] +> 2 receiving messages + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->2 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/2 Commit:2 Entries:[1/3 EntryNormal ""] +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:2 Lead:1 LeadEpoch:2 + Entries: + 1/3 EntryNormal "" + Messages: + 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + 2->1 MsgAppResp Term:1 Log:0/3 Commit:2 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:2 Lead:1 LeadEpoch:0 + Entries: + 1/3 EntryNormal "" + Messages: + 3->1 MsgAppResp Term:1 Log:0/3 Commit:2 +> 1 receiving messages + 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2 + 2->1 MsgAppResp Term:1 Log:0/3 Commit:2 + 3->1 MsgAppResp Term:1 Log:0/3 Commit:2 +> 1 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:3 + CommittedEntries: + 1/3 EntryNormal "" + Messages: + 1->2 MsgApp Term:1 Log:1/3 Commit:3 + 1->3 MsgApp Term:1 Log:1/3 Commit:3 +> 2 receiving messages + 1->2 MsgApp Term:1 Log:1/3 Commit:3 +> 3 receiving messages + 1->3 MsgApp Term:1 Log:1/3 Commit:3 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:2 + CommittedEntries: + 1/3 EntryNormal "" + Messages: + 2->1 MsgAppResp Term:1 Log:0/3 Commit:3 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:0 + CommittedEntries: + 1/3 EntryNormal "" + Messages: + 3->1 MsgAppResp Term:1 Log:0/3 Commit:3 +> 1 receiving messages + 2->1 MsgAppResp Term:1 Log:0/3 Commit:3 + 3->1 MsgAppResp Term:1 Log:0/3 Commit:3 + +# On the next heartbeat, the leader still won't send a MsgFortifyLeader to +# follower 3 because it doesn't support the leader in the store liveness fabric. 
+tick-heartbeat 1 +---- +ok + +stabilize 1 +---- +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgHeartbeat Term:1 Log:0/0 + +grant-support 3 1 +---- + 1 2 3 +1 3 1 1 +2 2 1 1 +3 3 1 1 + +# Now that follower 3 supports the leader in the store liveness fabric, the +# leader will try to fortify it on the next heartbeat. +tick-heartbeat 1 +---- +ok + +stabilize +---- +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 +> 2 receiving messages + 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgHeartbeat Term:1 Log:0/0 +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 2->1 MsgHeartbeatResp Term:1 Log:0/0 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:3 + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:3 +> 1 receiving messages + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:3 + +# If the follower supports the leader at an older epoch, the leader will try +# to refortify it on the next heartbeat timeout. 
+bump-epoch 1 +---- + 1 2 3 +1 4 1 1 +2 3 1 1 +3 4 1 1 + +tick-heartbeat 1 +---- +ok + +stabilize +---- +> 1 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:4 + Messages: + 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 +> 2 receiving messages + 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->2 MsgFortifyLeader Term:1 Log:0/0 +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgFortifyLeader Term:1 Log:0/0 +> 2 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:3 + Messages: + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:3 +> 3 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:3 Lead:1 LeadEpoch:4 + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:4 +> 1 receiving messages + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 2->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:3 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:4 + +# If no follower needs to be refortified, the leader won't send any +# fortification messages. 
+tick-heartbeat 1 +---- +ok + +stabilize +---- +> 1 handling Ready + Ready MustSync=false: + Messages: + 1->2 MsgHeartbeat Term:1 Log:0/0 + 1->3 MsgHeartbeat Term:1 Log:0/0 +> 2 receiving messages + 1->2 MsgHeartbeat Term:1 Log:0/0 +> 3 receiving messages + 1->3 MsgHeartbeat Term:1 Log:0/0 +> 2 handling Ready + Ready MustSync=false: + Messages: + 2->1 MsgHeartbeatResp Term:1 Log:0/0 +> 3 handling Ready + Ready MustSync=false: + Messages: + 3->1 MsgHeartbeatResp Term:1 Log:0/0 +> 1 receiving messages + 2->1 MsgHeartbeatResp Term:1 Log:0/0 + 3->1 MsgHeartbeatResp Term:1 Log:0/0 diff --git a/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt b/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt index 97a230c1364d..dc8501658067 100644 --- a/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt +++ b/pkg/raft/testdata/snapshot_succeed_via_app_resp.txt @@ -68,6 +68,7 @@ Ready MustSync=false: Messages: 1->2 MsgHeartbeat Term:1 Log:0/0 1->3 MsgHeartbeat Term:1 Log:0/0 +1->3 MsgFortifyLeader Term:1 Log:0/0 # Iterate until no more work is done by the new peer. It receives the heartbeat # and responds. @@ -77,12 +78,14 @@ stabilize 3 1->3 MsgHeartbeat Term:1 Log:0/0 INFO 3 [term: 0] received a MsgHeartbeat message with higher term from 1 [term: 1] INFO 3 became follower at term 1 + 1->3 MsgFortifyLeader Term:1 Log:0/0 > 3 handling Ready Ready MustSync=true: - HardState Term:1 Commit:0 Lead:1 LeadEpoch:0 + HardState Term:1 Commit:0 Lead:1 LeadEpoch:1 Messages: 3->1 MsgHeartbeatResp Term:1 Log:0/0 - + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:1 + # The leader in turn will realize that n3 needs a snapshot, which it initiates. 
stabilize 1 ---- @@ -90,6 +93,7 @@ stabilize 1 3->1 MsgHeartbeatResp Term:1 Log:0/0 DEBUG 1 [firstindex: 12, commit: 11] sent snapshot[index: 11, term: 1] to 3 [StateProbe match=0 next=11 sentCommit=10 matchCommit=0] DEBUG 1 paused sending replication messages to 3 [StateSnapshot match=0 next=12 sentCommit=11 matchCommit=0 paused pendingSnap=11] + 3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:1 > 1 handling Ready Ready MustSync=false: Messages: @@ -117,7 +121,7 @@ stabilize 3 INFO 3 [commit: 11] restored snapshot [index: 11, term: 1] > 3 handling Ready Ready MustSync=true: - HardState Term:1 Commit:11 Lead:1 LeadEpoch:0 + HardState Term:1 Commit:11 Lead:1 LeadEpoch:1 Snapshot Index:11 Term:1 ConfState:Voters:[1 2 3] VotersOutgoing:[] Learners:[] LearnersNext:[] AutoLeave:false Messages: 3->1 MsgAppResp Term:1 Log:0/11 Commit:11