From 67ab7d6665980026d9d87fefa9aa433deee875bb Mon Sep 17 00:00:00 2001
From: Peter Nose
Date: Mon, 2 Oct 2023 05:59:59 +0200
Subject: [PATCH] go/cometbft/apps/roothash: Process liveness statistics for workers only

---
 .../cometbft/apps/roothash/finalization.go    | 22 ++++----
 .../cometbft/apps/roothash/liveness.go        | 53 ++++++++-----------
 .../cometbft/apps/roothash/liveness_test.go   | 16 +++++-
 go/roothash/tests/tester.go                   | 19 +++----
 4 files changed, 54 insertions(+), 56 deletions(-)

diff --git a/go/consensus/cometbft/apps/roothash/finalization.go b/go/consensus/cometbft/apps/roothash/finalization.go
index a331ed650a1..70565d57c61 100644
--- a/go/consensus/cometbft/apps/roothash/finalization.go
+++ b/go/consensus/cometbft/apps/roothash/finalization.go
@@ -15,7 +15,6 @@ import (
 	roothash "github.com/oasisprotocol/oasis-core/go/roothash/api"
 	"github.com/oasisprotocol/oasis-core/go/roothash/api/block"
 	"github.com/oasisprotocol/oasis-core/go/roothash/api/commitment"
-	scheduler "github.com/oasisprotocol/oasis-core/go/scheduler/api"
 	staking "github.com/oasisprotocol/oasis-core/go/staking/api"
 )
 
@@ -181,23 +180,20 @@ func (app *rootHashApplication) tryFinalizeRoundInsideTx( //nolint: gocyclo
 		schedulerVote := sc.Commitment.ToVote()
 		for i, n := range rtState.Committee.Members {
 			vote, ok := sc.Votes[n.PublicKey]
-			// Make sure to not include nodes in multiple roles multiple times.
-			_, wasSeen := seen[n.PublicKey]
-			seen[n.PublicKey] = struct{}{}
 			switch {
-			case !ok && n.Role == scheduler.RoleBackupWorker && !pool.Discrepancy && !wasSeen:
-				// This is a backup worker only that did not submit a commitment and there was no
-				// discrepancy. Count the worker as live.
-				//
-				// Note that this skips the case where the node is both primary and backup and the
-				// primary did not commit as that should be treated as failure.
-				livenessStats.LiveRounds[i]++
+			case !ok:
 				continue
-			case !ok || vote == nil || wasSeen:
+			case vote == nil:
+				// Skip failures.
 				continue
-			default:
 			}
 
+			// Make sure to not include nodes in multiple roles multiple times.
+			if _, ok := seen[n.PublicKey]; ok {
+				continue
+			}
+			seen[n.PublicKey] = struct{}{}
+
 			// Resolve the entity owning the node.
 			var node *node.Node
 			node, err = regState.Node(ctx, n.PublicKey)
diff --git a/go/consensus/cometbft/apps/roothash/liveness.go b/go/consensus/cometbft/apps/roothash/liveness.go
index 993e5b794b7..0ebbaba694c 100644
--- a/go/consensus/cometbft/apps/roothash/liveness.go
+++ b/go/consensus/cometbft/apps/roothash/liveness.go
@@ -5,11 +5,11 @@ import (
 	"math"
 
 	beacon "github.com/oasisprotocol/oasis-core/go/beacon/api"
-	"github.com/oasisprotocol/oasis-core/go/common/crypto/signature"
 	tmapi "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/api"
 	registryState "github.com/oasisprotocol/oasis-core/go/consensus/cometbft/apps/registry/state"
 	registry "github.com/oasisprotocol/oasis-core/go/registry/api"
 	roothash "github.com/oasisprotocol/oasis-core/go/roothash/api"
+	"github.com/oasisprotocol/oasis-core/go/scheduler/api"
 	staking "github.com/oasisprotocol/oasis-core/go/staking/api"
 )
 
@@ -42,51 +42,42 @@ func processLivenessStatistics(ctx *tmapi.Context, epoch beacon.EpochTime, rtSta
 		"slash_amount", slashParams.Amount,
 	)
 
-	// Collect per node liveness statistics as a single node can have multiple roles.
-	type Stats struct {
-		liveRounds         uint64
-		finalizedProposals uint64
-		missedProposals    uint64
-	}
-	statsPerNode := make(map[signature.PublicKey]*Stats)
-	for i, member := range rtState.Committee.Members {
-		stats, ok := statsPerNode[member.PublicKey]
-		if !ok {
-			stats = &Stats{}
-			statsPerNode[member.PublicKey] = stats
+	// Penalize worker nodes that were not live enough.
+	regState := registryState.NewMutableState(ctx.State())
+	for i, n := range rtState.Committee.Members {
+		if n.Role != api.RoleWorker {
+			// Workers are listed before backup workers.
+			break
 		}
-		stats.liveRounds += rtState.LivenessStatistics.LiveRounds[i]
-		stats.finalizedProposals += rtState.LivenessStatistics.FinalizedProposals[i]
-		stats.missedProposals += rtState.LivenessStatistics.MissedProposals[i]
-	}
 
-	// Penalize nodes that were not live enough.
-	regState := registryState.NewMutableState(ctx.State())
-	for nodeID, stats := range statsPerNode {
-		status, err := regState.NodeStatus(ctx, nodeID)
+		status, err := regState.NodeStatus(ctx, n.PublicKey)
 		if err != nil {
-			return fmt.Errorf("failed to retrieve status for node %s: %w", nodeID, err)
+			return fmt.Errorf("failed to retrieve status for node %s: %w", n.PublicKey, err)
 		}
 		if status.IsSuspended(rtState.Runtime.ID, epoch) {
 			continue
 		}
 
-		maxMissedProposals := ((stats.missedProposals + stats.finalizedProposals) * maxMissedProposalsPercent) / 100
+		liveRounds := rtState.LivenessStatistics.LiveRounds[i]
+		finalizedProposals := rtState.LivenessStatistics.FinalizedProposals[i]
+		missedProposals := rtState.LivenessStatistics.MissedProposals[i]
+
+		maxMissedProposals := ((missedProposals + finalizedProposals) * maxMissedProposalsPercent) / 100
 		if maxMissedProposalsPercent == 0 {
 			maxMissedProposals = math.MaxUint64
 		}
 
 		switch {
-		case stats.liveRounds >= minLiveRounds && stats.missedProposals <= maxMissedProposals:
+		case liveRounds >= minLiveRounds && missedProposals <= maxMissedProposals:
 			// Node is live.
 			status.RecordSuccess(rtState.Runtime.ID, epoch)
 		default:
 			// Node is faulty.
 			ctx.Logger().Debug("node deemed faulty",
-				"node_id", nodeID,
-				"live_rounds", stats.liveRounds,
+				"node_id", n.PublicKey,
+				"live_rounds", liveRounds,
 				"min_live_rounds", minLiveRounds,
-				"missed_proposals", stats.missedProposals,
+				"missed_proposals", missedProposals,
 				"max_missed_proposals", maxMissedProposals,
 			)
 
@@ -103,15 +94,15 @@ func processLivenessStatistics(ctx *tmapi.Context, epoch beacon.EpochTime, rtSta
 				}
 
 				// Slash if configured.
-				err = onRuntimeLivenessFailure(ctx, nodeID, &slashParams.Amount)
+				err = onRuntimeLivenessFailure(ctx, n.PublicKey, &slashParams.Amount)
 				if err != nil {
-					return fmt.Errorf("failed to slash node %s: %w", nodeID, err)
+					return fmt.Errorf("failed to slash node %s: %w", n.PublicKey, err)
 				}
 			}
 		}
 
-		if err = regState.SetNodeStatus(ctx, nodeID, status); err != nil {
-			return fmt.Errorf("failed to set node status for node %s: %w", nodeID, err)
+		if err = regState.SetNodeStatus(ctx, n.PublicKey, status); err != nil {
+			return fmt.Errorf("failed to set node status for node %s: %w", n.PublicKey, err)
 		}
 	}
 
diff --git a/go/consensus/cometbft/apps/roothash/liveness_test.go b/go/consensus/cometbft/apps/roothash/liveness_test.go
index 3535c2e59f2..40218b44b18 100644
--- a/go/consensus/cometbft/apps/roothash/liveness_test.go
+++ b/go/consensus/cometbft/apps/roothash/liveness_test.go
@@ -131,7 +131,21 @@ func TestLivenessProcessing(t *testing.T) {
 	// Bump epoch so the node is no longer suspended.
 	epoch += 2
 
-	// When node is live again, fault counter should decrease.
+	// When node is a backup worker, fault counter should not change.
+	rtState.Committee.Members[0].Role = scheduler.RoleBackupWorker
+	rtState.LivenessStatistics.LiveRounds[0] = 91 // At least 90 required.
+	err = processLivenessStatistics(ctx, epoch, rtState)
+	require.NoError(err, "processLivenessStatistics")
+	status, err = registryState.NodeStatus(ctx, sk.Public())
+	require.NoError(err, "NodeStatus")
+	require.False(status.IsSuspended(runtime.ID, epoch), "node should not be suspended")
+	require.EqualValues(1, status.Faults[runtime.ID].Failures, "there should be one fault")
+
+	// Bump epoch before the next liveness check.
+	epoch += 2
+
+	// When node is worker again, fault counter should decrease.
+	rtState.Committee.Members[0].Role = scheduler.RoleWorker
 	rtState.LivenessStatistics.LiveRounds[0] = 91 // At least 90 required.
 	err = processLivenessStatistics(ctx, epoch, rtState)
 	require.NoError(err, "processLivenessStatistics")
diff --git a/go/roothash/tests/tester.go b/go/roothash/tests/tester.go
index 2b1f2942deb..204821c0784 100644
--- a/go/roothash/tests/tester.go
+++ b/go/roothash/tests/tester.go
@@ -513,25 +513,22 @@ func (s *runtimeState) testSuccessfulRound(t *testing.T, backend api.Backend, co
 	// Check that the liveness statistics were computed correctly.
 	livenessStatistics := s.livenessStatisticsDiff(t, ctx, backend, parent.Height)
 
-	goodRoundsPerNode := make(map[signature.PublicKey]uint64)
-	for i, member := range s.executorCommittee.committee.Members {
-		goodRoundsPerNode[member.PublicKey] += livenessStatistics.LiveRounds[i]
-	}
-
-	for nodeID, v := range goodRoundsPerNode {
-		// Workers and backup workers should be considered live as everyone submitted
-		// commitments and there were no discrepancies.
-		require.EqualValues(1, v, "LiveRounds(%s)", nodeID)
-	}
-
+	liveRounds := make([]uint64, len(livenessStatistics.LiveRounds))
 	finalizedProposals := make([]uint64, len(livenessStatistics.FinalizedProposals))
 	missedProposals := make([]uint64, len(livenessStatistics.MissedProposals))
 
+	// All workers and no backup workers should be considered live as every worker submitted
+	// a commitment and there were no discrepancies.
+	for i := range s.executorCommittee.workers {
+		liveRounds[i] = 1
+	}
+
 	schedulerIdx, err := s.executorCommittee.committee.SchedulerIdx(parent.Block.Header.Round, 0)
 	require.NoError(err, "SchedulerIdx")
 	finalizedProposals[schedulerIdx]++
 
 	require.Equal(uint64(1), livenessStatistics.TotalRounds, "there should be one finalized round")
+	require.EqualValues(liveRounds, livenessStatistics.LiveRounds, "only workers should be considered live")
 	require.EqualValues(finalizedProposals, livenessStatistics.FinalizedProposals, "there should be one finalized proposal")
 	require.EqualValues(missedProposals, livenessStatistics.MissedProposals, "there should be no failed proposals")
 }