Skip to content

Commit

Permalink
raft: refortify followers if they are not fortified
Browse files Browse the repository at this point in the history
This commit makes the leader periodically attempt to fortify unfortified
followers.

Before this change, the leader used to send a fortification message to
followers, but doesn't try to refortify unfortified followers.
  • Loading branch information
iskettaneh committed Sep 24, 2024
1 parent dd81ae4 commit 648076e
Show file tree
Hide file tree
Showing 8 changed files with 403 additions and 24 deletions.
1 change: 1 addition & 0 deletions pkg/raft/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ go_test(
"//pkg/raft/rafttest",
"//pkg/raft/tracker",
"//pkg/settings/cluster",
"//pkg/testutils",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
Expand Down
5 changes: 5 additions & 0 deletions pkg/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,11 @@ func (r *raft) tickHeartbeat() {
if err := r.Step(pb.Message{From: r.id, Type: pb.MsgBeat}); err != nil {
r.logger.Debugf("error occurred during checking sending heartbeat: %v", err)
}

// Try to refortify any followers that don't currently support us.
r.bcastFortify()
// TODO(ibrahim): add/call maybeUnpauseAndBcastAppend() here.

}
}

Expand Down
52 changes: 37 additions & 15 deletions pkg/raft/raft_paper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import (
"testing"

pb "github.com/cockroachdb/cockroach/pkg/raft/raftpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -105,23 +106,44 @@ func TestStartAsFollower(t *testing.T) {
func TestLeaderBcastBeat(t *testing.T) {
// heartbeat interval
hi := 1
r := newTestRaft(1, 10, hi, newTestMemoryStorage(withPeers(1, 2, 3)))
r.becomeCandidate()
r.becomeLeader()
for i := 0; i < 10; i++ {
mustAppendEntry(r, pb.Entry{Index: uint64(i) + 1})
}

for i := 0; i < hi; i++ {
r.tick()
}
testutils.RunTrueAndFalse(t, "store-liveness-enabled",
func(t *testing.T, storeLivenessEnabled bool) {
testOptions := emptyTestConfigModifierOpt()
if !storeLivenessEnabled {
testOptions = withFortificationDisabled()
}

msgs := r.readMessages()
sort.Sort(messageSlice(msgs))
assert.Equal(t, []pb.Message{
{From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat},
{From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat},
}, msgs)
r := newTestRaft(1, 10, hi,
newTestMemoryStorage(withPeers(1, 2, 3)), testOptions)

r.becomeCandidate()
r.becomeLeader()

for i := 0; i < 10; i++ {
mustAppendEntry(r, pb.Entry{Index: uint64(i) + 1})
}

for i := 0; i < hi; i++ {
r.tick()
}

msgs := r.readMessages()
sort.Sort(messageSlice(msgs))
if storeLivenessEnabled {
assert.Equal(t, []pb.Message{
{From: 1, To: 2, Term: 1, Type: pb.MsgFortifyLeader},
{From: 1, To: 3, Term: 1, Type: pb.MsgFortifyLeader},
{From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat},
{From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat},
}, msgs)
} else {
assert.Equal(t, []pb.Message{
{From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat},
{From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat},
}, msgs)
}
})
}

func TestFollowerStartElection(t *testing.T) {
Expand Down
5 changes: 5 additions & 0 deletions pkg/raft/raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4112,6 +4112,11 @@ type testConfigModifiers struct {
// that may be used to modify the config.
type testConfigModifierOpt func(*testConfigModifiers)

// emptyTestConfigModifierOpt returns an empty testConfigModifierOpt.
func emptyTestConfigModifierOpt() testConfigModifierOpt {
return func(modifier *testConfigModifiers) {}
}

// withRaftFortification disables raft fortification.
func withFortificationDisabled() testConfigModifierOpt {
return func(modifier *testConfigModifiers) {
Expand Down
19 changes: 16 additions & 3 deletions pkg/raft/testdata/async_storage_writes_append_aba_race.txt
Original file line number Diff line number Diff line change
Expand Up @@ -414,20 +414,32 @@ Messages:
4->5 MsgHeartbeat Term:3 Log:0/0
4->6 MsgHeartbeat Term:3 Log:0/0
4->7 MsgHeartbeat Term:3 Log:0/0
4->1 MsgFortifyLeader Term:3 Log:0/0
4->2 MsgFortifyLeader Term:3 Log:0/0
4->3 MsgFortifyLeader Term:3 Log:0/0
4->5 MsgFortifyLeader Term:3 Log:0/0
4->6 MsgFortifyLeader Term:3 Log:0/0
4->7 MsgFortifyLeader Term:3 Log:0/0
4->AppendThread MsgStorageAppend Term:0 Log:0/0 Responses:[
4->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1
]

deliver-msgs 1
----
4->1 MsgHeartbeat Term:3 Log:0/0
INFO 1 [term: 2] received a MsgHeartbeat message with higher term from 4 [term: 3]
INFO 1 became follower at term 3
4->1 MsgFortifyLeader Term:3 Log:0/0

process-ready 1
----
Ready MustSync=true:
HardState Term:3 Commit:11 Lead:4 LeadEpoch:0
HardState Term:3 Commit:11 Lead:4 LeadEpoch:1
Messages:
1->4 MsgHeartbeatResp Term:3 Log:0/0
1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4
1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4 LeadEpoch:1 Responses:[
1->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1
]

deliver-msgs 4
----
Expand Down Expand Up @@ -513,8 +525,9 @@ INFO mark (term,index)=(2,12) mismatched the last accepted term 3 in unstable lo
process-append-thread 1
----
Processing:
1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4
1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4 LeadEpoch:1
Responses:
1->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1

raft-log 1
----
Expand Down
48 changes: 45 additions & 3 deletions pkg/raft/testdata/checkquorum.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,31 +73,57 @@ INFO 1 became follower at term 1
stabilize
----
> 1 handling Ready
Ready MustSync=false:
Ready MustSync=true:
State:StateFollower
HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2
Messages:
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
> 2 receiving messages
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
> 3 receiving messages
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
> 2 handling Ready
Ready MustSync=false:
Messages:
Expand All @@ -107,13 +133,19 @@ stabilize
2->1 MsgAppResp Term:2 Log:0/0
2->1 MsgAppResp Term:2 Log:0/0
> 3 handling Ready
Ready MustSync=false:
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2
Messages:
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
> 1 receiving messages
2->1 MsgAppResp Term:2 Log:0/0
INFO 1 [term: 1] received a MsgAppResp message with higher term from 2 [term: 2]
Expand All @@ -132,6 +164,16 @@ stabilize
INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1]
3->1 MsgHeartbeatResp Term:1 Log:0/0
INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
> 1 handling Ready
Ready MustSync=true:
HardState Term:2 Commit:11 Lead:0 LeadEpoch:0
Expand Down Expand Up @@ -165,7 +207,7 @@ INFO 1 [logterm: 1, index: 11, vote: 0] cast MsgVote for 2 [logterm: 1, index: 1
deliver-msgs 3
----
2->3 MsgVote Term:3 Log:1/11
INFO 3 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3)
INFO 3 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) and supporting fortified leader 1 at epoch 2

stabilize
----
Expand Down
Loading

0 comments on commit 648076e

Please sign in to comment.