From 55246746f48216935db8a124c134d484b8d72977 Mon Sep 17 00:00:00 2001 From: Baka <48246993+quertc@users.noreply.github.com> Date: Sat, 7 Dec 2024 01:20:49 +0300 Subject: [PATCH] Feat/empty blocks (#89) * feat: add tenderduty_empty_proposed_blocks metric * feat: add empty_blocks_percentage and consecutive_empty alerting * chore: update config * fix: config * fix: EmptyPercentageAlerts formatting * feat: rm lib64 symlink --- example-config.yml | 15 +++++++++ td2/alert.go | 71 ++++++++++++++++++++++++++++++++++++++++++- td2/prometheus.go | 19 ++++++++++-- td2/static/grid.js | 16 ++++++++++ td2/static/index.html | 2 +- td2/types.go | 30 +++++++++++++----- td2/ws.go | 21 ++++++++++++- 7 files changed, 161 insertions(+), 13 deletions(-) diff --git a/example-config.yml b/example-config.yml index b0b9a1c..1aba99f 100644 --- a/example-config.yml +++ b/example-config.yml @@ -95,6 +95,21 @@ chains: # Percentage Missed alert Pagerduty Severity percentage_priority: warning + # Empty blocks notification configuration + consecutive_empty_enabled: yes + # How many consecutive empty blocks should trigger a notification? + consecutive_empty: 3 + # Consecutive Empty alert Pagerduty Severity + consecutive_empty_priority: critical + + # For some Cosmos EVM chains, empty consensus blocks may decrease execution uptime + # since they aren't included in EVM state. Should an alert be sent if empty blocks are detected? + empty_percentage_enabled: yes + # What percentage should trigger the alert + empty_percentage: 2 + # Percentage Empty alert Pagerduty Severity + empty_percentage_priority: warning + # Should an alert be sent if the validator is not in the active set ie, jailed, # tombstoned, unbonding? 
alert_if_inactive: yes diff --git a/td2/alert.go b/td2/alert.go index 014d057..b6079ba 100644 --- a/td2/alert.go +++ b/td2/alert.go @@ -396,7 +396,7 @@ func (c *Config) alert(chainName, message, severity string, resolved bool, id *s // and also updates a few prometheus stats // FIXME: not watching for nodes that are lagging the head block! func (cc *ChainConfig) watch() { - var missedAlarm, pctAlarm, noNodes bool + var missedAlarm, pctAlarm, noNodes, emptyBlocksAlarm, emptyPctAlarm bool inactive := "jailed" nodeAlarms := make(map[string]bool) @@ -572,6 +572,75 @@ func (cc *ChainConfig) watch() { cc.activeAlerts = alarms.getCount(cc.name) } + // empty blocks alarm handling + if !emptyBlocksAlarm && cc.Alerts.ConsecutiveEmptyAlerts && int(cc.statConsecutiveEmpty) >= cc.Alerts.ConsecutiveEmpty { + // alert on empty blocks counter! + emptyBlocksAlarm = true + id := cc.valInfo.Valcons + "empty" + td.alert( + cc.name, + fmt.Sprintf("%s has proposed %d consecutive empty blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveEmpty, cc.ChainId), + cc.Alerts.ConsecutiveEmptyPriority, + false, + &id, + ) + cc.activeAlerts = alarms.getCount(cc.name) + } else if emptyBlocksAlarm && int(cc.statConsecutiveEmpty) < cc.Alerts.ConsecutiveEmpty { + // clear the alert + emptyBlocksAlarm = false + id := cc.valInfo.Valcons + "empty" + td.alert( + cc.name, + fmt.Sprintf("%s has proposed %d consecutive empty blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveEmpty, cc.ChainId), + "info", + true, + &id, + ) + cc.activeAlerts = alarms.getCount(cc.name) + } + + // window percentage empty block alarms + var emptyBlocksPercent float64 + if cc.statTotalProps > 0 { + emptyBlocksPercent = 100 * float64(cc.statTotalPropsEmpty) / float64(cc.statTotalProps) + } + + if cc.Alerts.EmptyPercentageAlerts && !emptyPctAlarm && emptyBlocksPercent > float64(cc.Alerts.EmptyWindow) { + // alert on empty block percentage! 
+ emptyPctAlarm = true + id := cc.valInfo.Valcons + "empty_percent" + td.alert( + cc.name, + fmt.Sprintf("%s has > %d%% empty blocks (%d of %d proposed blocks) on %s", + cc.valInfo.Moniker, + cc.Alerts.EmptyWindow, + int(cc.statTotalPropsEmpty), + int(cc.statTotalProps), + cc.ChainId), + cc.Alerts.EmptyPercentagePriority, + false, + &id, + ) + cc.activeAlerts = alarms.getCount(cc.name) + } else if cc.Alerts.EmptyPercentageAlerts && emptyPctAlarm && emptyBlocksPercent < float64(cc.Alerts.EmptyWindow) { + // clear the alert + emptyPctAlarm = false + id := cc.valInfo.Valcons + "empty_percent" + td.alert( + cc.name, + fmt.Sprintf("%s has > %d%% empty blocks (%d of %d proposed blocks) on %s", + cc.valInfo.Moniker, + cc.Alerts.EmptyWindow, + int(cc.statTotalPropsEmpty), + int(cc.statTotalProps), + cc.ChainId), + "info", + true, + &id, + ) + cc.activeAlerts = alarms.getCount(cc.name) + } + // node down alarms for _, node := range cc.Nodes { // window percentage missed block alarms diff --git a/td2/prometheus.go b/td2/prometheus.go index 2e53083..659a0bd 100644 --- a/td2/prometheus.go +++ b/td2/prometheus.go @@ -3,13 +3,14 @@ package tenderduty import ( "context" "fmt" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/prometheus/client_golang/prometheus/promhttp" "log" "net/http" "sync" "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/prometheus/client_golang/prometheus/promhttp" ) var ( @@ -25,6 +26,8 @@ const ( metricPrevote metricPrecommit metricConsecutive + metricEmptyBlocks + metricConsecutiveEmpty metricWindowMissed metricWindowSize metricLastBlockSeconds @@ -91,6 +94,14 @@ func prometheusExporter(ctx context.Context, updates chan *promUpdate) { Name: "tenderduty_consecutive_missed_blocks", Help: "the current count of consecutively missed blocks regardless of precommit or prevote status", }, chainLabels) + 
emptyBlocks := promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "tenderduty_empty_proposed_blocks", + Help: "count of empty blocks proposed (blocks with zero transactions) since tenderduty was started", + }, chainLabels) + consecutiveEmpty := promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "tenderduty_consecutive_empty_blocks", + Help: "the current count of consecutively proposed empty blocks", + }, chainLabels) windowSize := promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "tenderduty_missed_block_window", Help: "the missed block aka slashing window", @@ -135,6 +146,8 @@ func prometheusExporter(ctx context.Context, updates chan *promUpdate) { metricPrevote: missedPrevote, metricPrecommit: missedPrecommit, metricConsecutive: missedConsecutive, + metricEmptyBlocks: emptyBlocks, + metricConsecutiveEmpty: consecutiveEmpty, metricWindowMissed: missedWindow, metricWindowSize: windowSize, metricLastBlockSeconds: lastBlockSec, diff --git a/td2/static/grid.js b/td2/static/grid.js index 7bd05d3..d76374c 100644 --- a/td2/static/grid.js +++ b/td2/static/grid.js @@ -73,6 +73,17 @@ function legend() { offset += 65 * scale grad = ctx.createLinearGradient(offset, 0, offset+gridW, gridH) + grad.addColorStop(0, 'rgb(255,215,0)'); + grad.addColorStop(0.3, 'rgb(255,235,100)'); + grad.addColorStop(0.8, 'rgb(255,223,66)'); + ctx.fillStyle = grad + ctx.fillRect(offset, 0, gridW, gridH) + ctx.fillStyle = 'grey' + offset += gridW + gridW/2 + ctx.fillText("proposer/empty",offset, gridH/1.2) + + offset += 110 * scale + grad = ctx.createLinearGradient(offset, 0, offset+gridW, gridH) grad.addColorStop(0, 'rgba(0,0,0,0.2)'); ctx.fillStyle = grad ctx.fillRect(offset, 0, gridW, gridH) @@ -148,6 +159,11 @@ function drawSeries(multiStates) { crossThrough = false const grad = ctx.createLinearGradient((i*gridW)+gridTextW, (gridH*j), (i * gridW) + gridW +gridTextW, (gridH*j)) switch (multiStates.Status[j].blocks[i]) { + case 5: // empty proposed + grad.addColorStop(0, 'rgb(255,215,0)'); + 
grad.addColorStop(0.3, 'rgb(255,235,100)'); + grad.addColorStop(0.8, 'rgb(255,223,66)'); + break case 4: // proposed grad.addColorStop(0, 'rgb(123,255,66)'); grad.addColorStop(0.3, 'rgb(240,255,128)'); diff --git a/td2/static/index.html b/td2/static/index.html index fdffb8c..d66e70d 100644 --- a/td2/static/index.html +++ b/td2/static/index.html @@ -28,7 +28,7 @@
- +
diff --git a/td2/types.go b/td2/types.go index a36840e..63984b2 100644 --- a/td2/types.go +++ b/td2/types.go @@ -97,12 +97,14 @@ type ChainConfig struct { lastBlockNum int64 activeAlerts int - statTotalSigns float64 - statTotalProps float64 - statTotalMiss float64 - statPrevoteMiss float64 - statPrecommitMiss float64 - statConsecutiveMiss float64 + statTotalSigns float64 + statTotalProps float64 + statTotalMiss float64 + statPrevoteMiss float64 + statPrecommitMiss float64 + statConsecutiveMiss float64 + statTotalPropsEmpty float64 + statConsecutiveEmpty float64 // ChainId is used to ensure any endpoints contacted claim to be on the correct chain. This is a weak verification, // no light client validation is performed, so caution is advised when using public endpoints. @@ -158,6 +160,20 @@ type AlertConfig struct { // PercentageAlerts is whether to alert on percentage based misses PercentageAlerts bool `yaml:"percentage_enabled"` + // How many consecutive empty blocks are acceptable before alerting + ConsecutiveEmpty int `yaml:"consecutive_empty"` + // Tag for pagerduty to set the alert priority for empty blocks + ConsecutiveEmptyPriority string `yaml:"consecutive_empty_priority"` + // Whether to alert on consecutive empty blocks + ConsecutiveEmptyAlerts bool `yaml:"consecutive_empty_enabled"` + + // EmptyWindow is how many blocks empty as a percentage of proposed blocks since tenderduty was started to trigger an alert + EmptyWindow int `yaml:"empty_percentage"` + // EmptyPercentagePriority is a tag for pagerduty to route on priority + EmptyPercentagePriority string `yaml:"empty_percentage_priority"` + // EmptyPercentageAlerts is whether to alert on percentage based empty blocks + EmptyPercentageAlerts bool `yaml:"empty_percentage_enabled"` + // AlertIfInactive decides if tenderduty send an alert if the validator is not in the active set? AlertIfInactive bool `yaml:"alert_if_inactive"` // AlertIfNoServers: should an alert be sent if no servers are reachable? 
@@ -319,7 +335,7 @@ func validateConfig(c *Config) (fatal bool, problems []string) { fallthrough case v.Alerts.Telegram.Enabled && !c.Telegram.Enabled: problems = append(problems, fmt.Sprintf("warn: %20s is configured for telegram alerts, but it is not enabled", k)) - case !v.Alerts.ConsecutiveAlerts && !v.Alerts.PercentageAlerts && !v.Alerts.AlertIfInactive && !v.Alerts.AlertIfNoServers: + case !v.Alerts.ConsecutiveAlerts && !v.Alerts.PercentageAlerts && !v.Alerts.AlertIfInactive && !v.Alerts.AlertIfNoServers && !v.Alerts.ConsecutiveEmptyAlerts && !v.Alerts.EmptyPercentageAlerts: problems = append(problems, fmt.Sprintf("warn: %20s has no alert types configured", k)) fallthrough case !v.Alerts.Pagerduty.Enabled && !v.Alerts.Discord.Enabled && !v.Alerts.Telegram.Enabled && !v.Alerts.Slack.Enabled: diff --git a/td2/ws.go b/td2/ws.go index f3d83e8..4a89301 100644 --- a/td2/ws.go +++ b/td2/ws.go @@ -32,6 +32,7 @@ const ( StatusPrecommit StatusSigned StatusProposed + StatusProposedEmpty ) // StatusUpdate is passed over a channel from the websocket client indicating the current state, it is immediate in the @@ -41,6 +42,7 @@ type StatusUpdate struct { Height int64 Status StatusType Final bool + Empty bool } // WsReply is a trimmed down version of the JSON sent from a tendermint websocket subscription. 
@@ -150,6 +152,13 @@ func (cc *ChainConfig) WsRun() { cc.statTotalProps += 1 cc.statTotalSigns += 1 cc.statConsecutiveMiss = 0 + cc.statConsecutiveEmpty = 0 + case StatusProposedEmpty: + cc.statTotalPropsEmpty += 1 + cc.statTotalProps += 1 + cc.statTotalSigns += 1 + cc.statConsecutiveMiss = 0 + cc.statConsecutiveEmpty += 1 } signState = -1 healthyNodes := 0 @@ -196,6 +205,8 @@ func (cc *ChainConfig) WsRun() { td.statsChan <- cc.mkUpdate(metricPrevote, cc.statPrevoteMiss, "") td.statsChan <- cc.mkUpdate(metricPrecommit, cc.statPrecommitMiss, "") td.statsChan <- cc.mkUpdate(metricConsecutive, cc.statConsecutiveMiss, "") + td.statsChan <- cc.mkUpdate(metricEmptyBlocks, float64(cc.statTotalPropsEmpty), "") + td.statsChan <- cc.mkUpdate(metricConsecutiveEmpty, float64(cc.statConsecutiveEmpty), "") td.statsChan <- cc.mkUpdate(metricUnealthyNodes, float64(len(cc.Nodes)-healthyNodes), "") } } @@ -286,6 +297,9 @@ type rawBlock struct { LastCommit struct { Signatures []signature `json:"signatures"` } `json:"last_commit"` + Data struct { + Txs []json.RawMessage `json:"txs"` + } `json:"data"` } `json:"block"` } @@ -327,9 +341,14 @@ func handleBlocks(ctx context.Context, blocks chan *WsReply, results chan Status Height: b.Block.Header.Height.val(), Status: Statusmissed, Final: true, + Empty: len(b.Block.Data.Txs) == 0, } if b.Block.Header.ProposerAddress == address { - upd.Status = StatusProposed + if upd.Empty { + upd.Status = StatusProposedEmpty + } else { + upd.Status = StatusProposed + } } else if b.find(address) { upd.Status = StatusSigned }