Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport of [NET-3865] [Supportability] Additional Information in the output of 'consul operator raft list-peers' into release/1.16.x #17738

3 changes: 3 additions & 0 deletions .changelog/17582.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:feature
cli: `consul operator raft list-peers` command shows the number of commits each follower is trailing the leader by to aid in troubleshooting.
```
7 changes: 7 additions & 0 deletions agent/consul/operator_raft_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ func (op *Operator) RaftGetConfiguration(args *structs.DCSpecificRequest, reply
serverMap[raft.ServerAddress(addr)] = member
}

serverIDLastIndexMap := make(map[raft.ServerID]uint64)

for _, serverState := range op.srv.autopilot.GetState().Servers {
serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex
}

// Fill out the reply.
leader := op.srv.raft.Leader()
reply.Index = future.Index()
Expand All @@ -66,6 +72,7 @@ func (op *Operator) RaftGetConfiguration(args *structs.DCSpecificRequest, reply
Leader: server.Address == leader,
Voter: server.Suffrage == raft.Voter,
ProtocolVersion: raftProtocolVersion,
LastIndex: serverIDLastIndexMap[server.ID],
}
reply.Servers = append(reply.Servers, entry)
}
Expand Down
13 changes: 13 additions & 0 deletions agent/consul/operator_raft_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ func TestOperator_RaftGetConfiguration(t *testing.T) {
if len(future.Configuration().Servers) != 1 {
t.Fatalf("bad: %v", future.Configuration().Servers)
}

serverIDLastIndexMap := make(map[raft.ServerID]uint64)

for _, serverState := range s1.autopilot.GetState().Servers {
serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex
}

me := future.Configuration().Servers[0]
expected := structs.RaftConfigurationResponse{
Servers: []*structs.RaftServer{
Expand All @@ -60,6 +67,7 @@ func TestOperator_RaftGetConfiguration(t *testing.T) {
Leader: true,
Voter: true,
ProtocolVersion: "3",
LastIndex: serverIDLastIndexMap[me.ID],
},
},
Index: future.Index(),
Expand Down Expand Up @@ -113,6 +121,10 @@ func TestOperator_RaftGetConfiguration_ACLDeny(t *testing.T) {
if len(future.Configuration().Servers) != 1 {
t.Fatalf("bad: %v", future.Configuration().Servers)
}
serverIDLastIndexMap := make(map[raft.ServerID]uint64)
for _, serverState := range s1.autopilot.GetState().Servers {
serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex
}
me := future.Configuration().Servers[0]
expected := structs.RaftConfigurationResponse{
Servers: []*structs.RaftServer{
Expand All @@ -123,6 +135,7 @@ func TestOperator_RaftGetConfiguration_ACLDeny(t *testing.T) {
Leader: true,
Voter: true,
ProtocolVersion: "3",
LastIndex: serverIDLastIndexMap[me.ID],
},
},
Index: future.Index(),
Expand Down
3 changes: 3 additions & 0 deletions agent/structs/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ type RaftServer struct {
// it's a non-voting server, which will be added in a future release of
// Consul.
Voter bool

// LastIndex is the last log index this server has a record of in its Raft log.
LastIndex uint64
}

// RaftConfigurationResponse is returned when querying for the current Raft
Expand Down
3 changes: 3 additions & 0 deletions api/operator_raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ type RaftServer struct {
// it's a non-voting server, which will be added in a future release of
// Consul.
Voter bool

// LastIndex is the last log index this server has a record of in its Raft log.
LastIndex uint64
}

// RaftConfiguration is returned when querying for the current Raft configuration.
Expand Down
34 changes: 31 additions & 3 deletions command/operator/raft/listpeers/operator_raft_list.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,24 @@ func raftListPeers(client *api.Client, stale bool) (string, error) {
return "", fmt.Errorf("Failed to retrieve raft configuration: %v", err)
}

leaderLastCommitIndex := uint64(0)
serverIdLastIndexMap := make(map[string]uint64)

for _, raftServer := range reply.Servers {
serverIdLastIndexMap[raftServer.ID] = raftServer.LastIndex
}

for _, s := range reply.Servers {
if s.Leader {
lastIndex, ok := serverIdLastIndexMap[s.ID]
if ok {
leaderLastCommitIndex = lastIndex
}
}
}

// Format it as a nice table.
result := []string{"Node\x1fID\x1fAddress\x1fState\x1fVoter\x1fRaftProtocol"}
result := []string{"Node\x1fID\x1fAddress\x1fState\x1fVoter\x1fRaftProtocol\x1fCommit Index\x1fTrails Leader By"}
for _, s := range reply.Servers {
raftProtocol := s.ProtocolVersion

Expand All @@ -82,8 +98,20 @@ func raftListPeers(client *api.Client, stale bool) (string, error) {
if s.Leader {
state = "leader"
}
result = append(result, fmt.Sprintf("%s\x1f%s\x1f%s\x1f%s\x1f%v\x1f%s",
s.Node, s.ID, s.Address, state, s.Voter, raftProtocol))

trailsLeaderByText := "-"
serverLastIndex, ok := serverIdLastIndexMap[s.ID]
if ok {
trailsLeaderBy := leaderLastCommitIndex - serverLastIndex
trailsLeaderByText = fmt.Sprintf("%d commits", trailsLeaderBy)
if s.Leader {
trailsLeaderByText = "-"
} else if trailsLeaderBy == 1 {
trailsLeaderByText = fmt.Sprintf("%d commit", trailsLeaderBy)
}
}
result = append(result, fmt.Sprintf("%s\x1f%s\x1f%s\x1f%s\x1f%v\x1f%s\x1f%v\x1f%s",
s.Node, s.ID, s.Address, state, s.Voter, raftProtocol, serverLastIndex, trailsLeaderByText))
}

return columnize.Format(result, &columnize.Config{Delim: string([]byte{0x1f})}), nil
Expand Down
2 changes: 1 addition & 1 deletion command/operator/raft/listpeers/operator_raft_list_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func TestOperatorRaftListPeersCommand(t *testing.T) {
a := agent.NewTestAgent(t, ``)
defer a.Shutdown()

expected := fmt.Sprintf("%s %s 127.0.0.1:%d leader true 3",
expected := fmt.Sprintf("%s %s 127.0.0.1:%d leader true 3 1 -",
a.Config.NodeName, a.Config.NodeID, a.Config.ServerPort)

// Test the list-peers subcommand directly
Expand Down
12 changes: 6 additions & 6 deletions website/content/commands/operator/raft.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ Usage: `consul operator raft list-peers -stale=[true|false]`
The output looks like this:

```text
Node ID Address State Voter RaftProtocol
alice 127.0.0.1:8300 127.0.0.1:8300 follower true 2
bob 127.0.0.2:8300 127.0.0.2:8300 leader true 3
carol 127.0.0.3:8300 127.0.0.3:8300 follower true 2
Node ID Address State Voter RaftProtocol Commit Index Trails Leader By
alice 127.0.0.1:8300 127.0.0.1:8300 follower true 2 1167 0 commits
bob 127.0.0.2:8300 127.0.0.2:8300 leader true 3 1167 -
carol 127.0.0.3:8300 127.0.0.3:8300 follower true 2 1159 8 commits
```

`Node` is the node name of the server, as known to Consul, or "(unknown)" if
Expand All @@ -70,7 +70,7 @@ configuration.

- `-stale` - Enables non-leader servers to provide cluster state information.
If the cluster is in an outage state without a leader,
we recommend setting this option to `true.
we recommend setting this option to `true`.
Default is `false`.

## remove-peer
Expand Down Expand Up @@ -109,7 +109,7 @@ The return code will indicate success or failure.

Corresponding HTTP API Endpoint: [\[POST\] /v1/operator/raft/transfer-leader](/consul/api-docs/operator/raft#transfer-raft-leadership)

This command transfers Raft leadership to another server agent. If an `id` is provided, Consul transfers leadership to the server with that id.
This command transfers Raft leadership to another server agent. If an `id` is provided, Consul transfers leadership to the server with that id.

Use this command to change leadership without restarting the leader node, which maintains quorum and workload capacity.

Expand Down