Skip to content

Commit

Permalink
raft: refortify followers if they are not fortified
Browse files Browse the repository at this point in the history
This commit makes the leader refortify followers that needs
fortification. It keeps checking on every heartbeat timeout. Moreover,
the leader now skips sending fortification messages to followers whose
stores don't provide support in the store-liveness-fabric.
  • Loading branch information
iskettaneh committed Sep 13, 2024
1 parent 37d4c58 commit 447403f
Show file tree
Hide file tree
Showing 8 changed files with 343 additions and 25 deletions.
1 change: 1 addition & 0 deletions pkg/raft/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ go_test(
"//pkg/raft/rafttest",
"//pkg/raft/tracker",
"//pkg/settings/cluster",
"//pkg/testutils",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
Expand Down
5 changes: 5 additions & 0 deletions pkg/raft/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,11 @@ func (r *raft) tickHeartbeat() {
if err := r.Step(pb.Message{From: r.id, Type: pb.MsgBeat}); err != nil {
r.logger.Debugf("error occurred during checking sending heartbeat: %v", err)
}

// Try to refortify any followers that don't currently support us.
r.maybeBcastFortify()
// TODO(ibrahim): add/call maybeUnpauseAndBcastAppend() here.

}
}

Expand Down
53 changes: 38 additions & 15 deletions pkg/raft/raft_paper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import (
"testing"

pb "github.com/cockroachdb/cockroach/pkg/raft/raftpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -105,23 +106,45 @@ func TestStartAsFollower(t *testing.T) {
func TestLeaderBcastBeat(t *testing.T) {
// heartbeat interval
hi := 1
r := newTestRaft(1, 10, hi, newTestMemoryStorage(withPeers(1, 2, 3)))
r.becomeCandidate()
r.becomeLeader()
for i := 0; i < 10; i++ {
mustAppendEntry(r, pb.Entry{Index: uint64(i) + 1})
}

for i := 0; i < hi; i++ {
r.tick()
}
testutils.RunTrueAndFalse(t, "store-liveness-enabled",
func(t *testing.T, storeLivenessEnabled bool) {
var r *raft
if storeLivenessEnabled {
r = newTestRaft(1, 10, hi,
newTestMemoryStorage(withPeers(1, 2, 3)))
} else {
r = newTestRaft(1, 10, hi,
newTestMemoryStorage(withPeers(1, 2, 3)), withFortificationDisabled())
}

msgs := r.readMessages()
sort.Sort(messageSlice(msgs))
assert.Equal(t, []pb.Message{
{From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat},
{From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat},
}, msgs)
r.becomeCandidate()
r.becomeLeader()

for i := 0; i < 10; i++ {
mustAppendEntry(r, pb.Entry{Index: uint64(i) + 1})
}

for i := 0; i < hi; i++ {
r.tick()
}

msgs := r.readMessages()
sort.Sort(messageSlice(msgs))
if storeLivenessEnabled {
assert.Equal(t, []pb.Message{
{From: 1, To: 2, Term: 1, Type: pb.MsgFortifyLeader},
{From: 1, To: 3, Term: 1, Type: pb.MsgFortifyLeader},
{From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat},
{From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat},
}, msgs)
} else {
assert.Equal(t, []pb.Message{
{From: 1, To: 2, Term: 1, Type: pb.MsgHeartbeat},
{From: 1, To: 3, Term: 1, Type: pb.MsgHeartbeat},
}, msgs)
}
})
}

func TestFollowerStartElection(t *testing.T) {
Expand Down
19 changes: 16 additions & 3 deletions pkg/raft/testdata/async_storage_writes_append_aba_race.txt
Original file line number Diff line number Diff line change
Expand Up @@ -414,20 +414,32 @@ Messages:
4->5 MsgHeartbeat Term:3 Log:0/0
4->6 MsgHeartbeat Term:3 Log:0/0
4->7 MsgHeartbeat Term:3 Log:0/0
4->1 MsgFortifyLeader Term:3 Log:0/0
4->2 MsgFortifyLeader Term:3 Log:0/0
4->3 MsgFortifyLeader Term:3 Log:0/0
4->5 MsgFortifyLeader Term:3 Log:0/0
4->6 MsgFortifyLeader Term:3 Log:0/0
4->7 MsgFortifyLeader Term:3 Log:0/0
4->AppendThread MsgStorageAppend Term:0 Log:0/0 Responses:[
4->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1
]

deliver-msgs 1
----
4->1 MsgHeartbeat Term:3 Log:0/0
INFO 1 [term: 2] received a MsgHeartbeat message with higher term from 4 [term: 3]
INFO 1 became follower at term 3
4->1 MsgFortifyLeader Term:3 Log:0/0

process-ready 1
----
Ready MustSync=true:
HardState Term:3 Commit:11 Lead:4 LeadEpoch:0
HardState Term:3 Commit:11 Lead:4 LeadEpoch:1
Messages:
1->4 MsgHeartbeatResp Term:3 Log:0/0
1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4
1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4 LeadEpoch:1 Responses:[
1->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1
]

deliver-msgs 4
----
Expand Down Expand Up @@ -513,8 +525,9 @@ INFO mark (term,index)=(2,12) mismatched the last accepted term 3 in unstable lo
process-append-thread 1
----
Processing:
1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4
1->AppendThread MsgStorageAppend Term:3 Log:0/0 Commit:11 Lead:4 LeadEpoch:1
Responses:
1->4 MsgFortifyLeaderResp Term:3 Log:0/0 LeadEpoch:1

raft-log 1
----
Expand Down
48 changes: 45 additions & 3 deletions pkg/raft/testdata/checkquorum.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,31 +73,57 @@ INFO 1 became follower at term 1
stabilize
----
> 1 handling Ready
Ready MustSync=false:
Ready MustSync=true:
State:StateFollower
HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2
Messages:
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->2 MsgHeartbeat Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
> 2 receiving messages
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
1->2 MsgHeartbeat Term:1 Log:0/0
1->2 MsgFortifyLeader Term:1 Log:0/0
INFO 2 [term: 2] ignored a MsgFortifyLeader message with lower term from 1 [term: 1]
> 3 receiving messages
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
1->3 MsgHeartbeat Term:1 Log:0/0
1->3 MsgFortifyLeader Term:1 Log:0/0
> 2 handling Ready
Ready MustSync=false:
Messages:
Expand All @@ -107,13 +133,19 @@ stabilize
2->1 MsgAppResp Term:2 Log:0/0
2->1 MsgAppResp Term:2 Log:0/0
> 3 handling Ready
Ready MustSync=false:
Ready MustSync=true:
HardState Term:1 Vote:1 Commit:11 Lead:1 LeadEpoch:2
Messages:
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgHeartbeatResp Term:1 Log:0/0
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
> 1 receiving messages
2->1 MsgAppResp Term:2 Log:0/0
INFO 1 [term: 1] received a MsgAppResp message with higher term from 2 [term: 2]
Expand All @@ -132,6 +164,16 @@ stabilize
INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1]
3->1 MsgHeartbeatResp Term:1 Log:0/0
INFO 1 [term: 2] ignored a MsgHeartbeatResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
3->1 MsgFortifyLeaderResp Term:1 Log:0/0 LeadEpoch:2
INFO 1 [term: 2] ignored a MsgFortifyLeaderResp message with lower term from 3 [term: 1]
> 1 handling Ready
Ready MustSync=true:
HardState Term:2 Commit:11 Lead:0 LeadEpoch:0
Expand Down Expand Up @@ -165,7 +207,7 @@ INFO 1 [logterm: 1, index: 11, vote: 0] cast MsgVote for 2 [logterm: 1, index: 1
deliver-msgs 3
----
2->3 MsgVote Term:3 Log:1/11
INFO 3 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3)
INFO 3 [logterm: 1, index: 11, vote: 1] ignored MsgVote from 2 [logterm: 1, index: 11] at term 1: recently received communication from leader (remaining ticks: 3) and supporting fortified leader 1 at epoch 2

stabilize
----
Expand Down
1 change: 0 additions & 1 deletion pkg/raft/testdata/fortification_basic.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ withdraw-support 3 1
2 2 1 1
3 x 1 1


campaign 1
----
INFO 1 is starting a new election at term 0
Expand Down
Loading

0 comments on commit 447403f

Please sign in to comment.