Commit v2.0.2
Various bug fixes

fixes a channel deadlock when more than 32 chains are monitored (thanks Shultzie for the report; see the sketch below)
fixes problems with restoring state for node-down alerts (thanks m8nky for the report)
improves alarm deduplication
tidies up a few small issues flagged by snyk, gocritic, and staticcheck
blockpane authored Jul 6, 2022
2 parents 631fc73 + 4212a20 commit dfd932b
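
A note on the deadlock fix: tenderduty fans per-chain updates into shared channels, and when neither the dashboard nor prometheus is enabled nothing reads them, so once a buffer fills every producer goroutine blocks. The run.go hunks below add drain loops for exactly this case. The sketch here illustrates the failure mode and the drain pattern; the channel name and the buffer size of 32 are assumptions inferred from the report, not values taken from the code.

```go
package main

import "fmt"

func main() {
	// A buffered channel accepts 32 sends; with no consumer, the 33rd
	// send blocks forever and every producing goroutine deadlocks.
	updates := make(chan string, 32) // hypothetical buffer size

	// The fix pattern from run.go: when no real consumer exists
	// (dashboard and prometheus disabled), drain in a throwaway goroutine.
	go func() {
		for range updates {
		}
	}()

	for i := 0; i < 64; i++ {
		updates <- fmt.Sprintf("chain-%d update", i) // no longer blocks past 32
	}
	close(updates)
}
```

Without the drain goroutine, the loop above would hang on the 33rd send.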
Showing 17 changed files with 325 additions and 106 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-publish.yml
@@ -8,7 +8,7 @@ name: Docker
on:
push:
branches:
- "release/*"
#- "release/*"
- main
# Publish semver tags as releases.
tags:
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,7 +1,9 @@
config.yml
docker-compose.yml
.tenderduty-state.json
tenderduty-*
tenderduty
local*

.idea
.DS_Store
2 changes: 1 addition & 1 deletion docs/config.md
@@ -103,7 +103,7 @@ chains:
| `chain."name".alerts.percentage_missed` | What percentage should trigger the alert? |
| `chain."name".alerts.percentage_priority` | NOT USED: future hint for pagerduty's routing. |
| `chain."name".alerts.alert_if_inactive` | Should an alert be sent if the validator is not in the active set: jailed, tombstoned, or unbonding? |
| `chain."name".alerts.alert_if_no_servers` | Should an alert be sent if no RPC servers are responding? (Note this alarm is instantaneous with no delay) |
| `chain."name".alerts.alert_if_no_servers` | Should an alert be sent if no RPC servers are responding? (Note this alarm uses the node_down_alert_minutes setting) |
| `chain."name".alerts.pagerduty.*` | This section is the same as the pagerduty structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the api_key is blank it will use the settings defined in `pagerduty.*` <br />*Note both `pagerduty.enabled` and `chain."name".alerts.pagerduty.enabled` must be 'yes' to get alerts.* |
| `chain."name".alerts.discord.*` | This section is the same as the discord structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the webhook is blank it will use the settings defined in `discord.*` <br />*Note both `discord.enabled` and `chain."name".alerts.discord.enabled` must be 'yes' to get alerts.* |
| `chain."name".alerts.telegram.*` | This section is the same as the telegram structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the api_key and channel are blank it will use the settings defined in `telegram.*` <br />*Note both `telegram.enabled` and `chain."name".alerts.telegram.enabled` must be 'yes' to get alerts.* |
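
The corrected row above changes the documented behavior of no-server alarms from instantaneous to delayed. Below is a hypothetical config.yml fragment tying the two settings together; the key names come from the table above, while the values, nesting, and chain name are illustrative only:

```yaml
node_down_alert_minutes: 3      # global delay before down alarms fire

chains:
  "Osmosis":
    alerts:
      alert_if_no_servers: yes  # now waits node_down_alert_minutes instead of firing instantly
```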
2 changes: 1 addition & 1 deletion main.go
@@ -25,7 +25,7 @@ func main() {
os.Exit(0)
}

err := td2.Run(configFile, stateFile, dumpConfig)
err := td2.Run(configFile, stateFile)
if err != nil {
log.Println(err.Error(), "... exiting.")
}
77 changes: 48 additions & 29 deletions td2/alert.go
@@ -80,15 +80,22 @@ func shouldNotify(msg *alertMsg, dest notifyDest) bool {
whichMap = alarms.SentDiAlarms
service = "Discord"
}
if !whichMap[msg.message].IsZero() && !msg.resolved {

switch {
case !whichMap[msg.message].IsZero() && !msg.resolved:
// already sent this alert
return false
} else if !whichMap[msg.message].IsZero() && msg.resolved {
case !whichMap[msg.message].IsZero() && msg.resolved:
// alarm is cleared
delete(whichMap, msg.message)
l(fmt.Sprintf("💜 Resolved alarm on %s (%s) - notifying %s", msg.chain, msg.message, service))
return true
case msg.resolved:
// it looks like we got a duplicate resolution or suppressed it. Note it and move on:
l(fmt.Sprintf("😕 Not clearing alarm on %s (%s) - no corresponding alert %s", msg.chain, msg.message, service))
return false
}

// check if the alarm is flapping, if we sent the same alert in the last five minutes, show a warning but don't alert
if alarms.flappingAlarms[msg.chain] == nil {
alarms.flappingAlarms[msg.chain] = make(map[string]time.Time)
@@ -138,11 +145,6 @@ func notifyDiscord(msg *alertMsg) (err error) {

if resp.StatusCode != 204 {
log.Println(resp)
//if resp.Body != nil {
// b, _ := ioutil.ReadAll(resp.Body)
// _ = resp.Body.Close()
// fmt.Println(string(b))
//}
l("notify discord:", err)
return err
}
@@ -184,7 +186,6 @@ func notifyTg(msg *alertMsg) (err error) {
if !shouldNotify(msg, tg) {
return nil
}
//tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
bot, err := tgbotapi.NewBotAPI(msg.tgKey)
if err != nil {
l("notify telegram:", err)
@@ -197,7 +198,6 @@ }
}

mc := tgbotapi.NewMessageToChannel(msg.tgChannel, fmt.Sprintf("%s: %s - %s", msg.chain, prefix, msg.message))
//mc.ParseMode = "html"
_, err = bot.Send(mc)
if err != nil {
l("telegram send:", err)
@@ -240,7 +240,6 @@ func getAlarms(chain string) string {
alarms.notifyMux.RLock()
defer alarms.notifyMux.RUnlock()
// don't show this info if the logs are disabled on the dashboard, potentially sensitive info could be leaked.
//if td.HideLogs || currentAlarms[chain] == nil {
if td.HideLogs || alarms.AllAlarms[chain] == nil {
return ""
}
@@ -299,10 +298,11 @@ func (cc *ChainConfig) watch() {
nodeAlarms := make(map[string]bool)

// wait until we have a moniker:
noNodesSec := 0 // delay a no-nodes alarm for 30 seconds, too noisy.
for {
if cc.valInfo == nil || cc.valInfo.Moniker == "not connected" {
time.Sleep(time.Second)
if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes {
if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes && noNodesSec >= 60*td.NodeDownMin {
noNodes = true
td.alert(
cc.name,
@@ -312,8 +312,10 @@
&cc.valInfo.Valcons,
)
}
noNodesSec += 1
continue
}
noNodesSec = 0
break
}
// initial stat creation for nodes, we only update again if the node is positive
@@ -327,16 +329,26 @@
time.Sleep(2 * time.Second)

// alert if we can't monitor
if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes {
noNodes = true
td.alert(
cc.name,
fmt.Sprintf("no RPC endpoints are working for %s", cc.ChainId),
"critical",
false,
&cc.valInfo.Valcons,
)
} else if cc.Alerts.AlertIfNoServers && noNodes && !cc.noNodes {
switch {
case cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes:
noNodesSec += 2
if noNodesSec <= 30*td.NodeDownMin {
if noNodesSec%20 == 0 {
l(fmt.Sprintf("no nodes available on %s for %d seconds, deferring alarm", cc.ChainId, noNodesSec))
}
noNodes = false
} else {
noNodesSec = 0
noNodes = true
td.alert(
cc.name,
fmt.Sprintf("no RPC endpoints are working for %s", cc.ChainId),
"critical",
false,
&cc.valInfo.Valcons,
)
}
case cc.Alerts.AlertIfNoServers && noNodes && !cc.noNodes:
noNodes = false
td.alert(
cc.name,
@@ -345,6 +357,8 @@
true,
&cc.valInfo.Valcons,
)
default:
noNodesSec = 0
}

// stalled chain detection
@@ -428,7 +442,6 @@
}

// window percentage missed block alarms
//fmt.Println(100*float64(cc.valInfo.Missed)/float64(cc.valInfo.Window), float64(cc.Alerts.Window))
if cc.Alerts.PercentageAlerts && !pctAlarm && 100*float64(cc.valInfo.Missed)/float64(cc.valInfo.Window) > float64(cc.Alerts.Window) {
// alert on missed block counter!
pctAlarm = true
@@ -458,21 +471,27 @@
// node down alarms
for _, node := range cc.Nodes {
// window percentage missed block alarms
if node.AlertIfDown && node.down && !nodeAlarms[node.Url] && !node.downSince.IsZero() && time.Now().Sub(node.downSince).Minutes() > float64(td.NodeDownMin) {
if node.AlertIfDown && node.down && !node.wasDown && !node.downSince.IsZero() &&
time.Since(node.downSince) > time.Duration(td.NodeDownMin)*time.Minute {
// alert on dead node
cc.activeAlerts += 1
nodeAlarms[node.Url] = true
if !nodeAlarms[node.Url] {
cc.activeAlerts += 1
} else {
continue
}
nodeAlarms[node.Url] = true // used to keep active alert count correct
td.alert(
cc.name,
fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
"critical",
false,
&node.Url,
)
} else if nodeAlarms[node.Url] && node.downSince.IsZero() {
} else if node.AlertIfDown && !node.down && node.wasDown {
// clear the alert
cc.activeAlerts -= 1
nodeAlarms[node.Url] = false
cc.activeAlerts -= 1
node.wasDown = false
td.alert(
cc.name,
fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
Expand All @@ -485,11 +504,11 @@ func (cc *ChainConfig) watch() {

if td.Prom {
// raw block timer, ignoring finalized state
td.statsChan <- cc.mkUpdate(metricLastBlockSecondsNotFinal, time.Now().Sub(cc.lastBlockTime).Seconds(), "")
td.statsChan <- cc.mkUpdate(metricLastBlockSecondsNotFinal, time.Since(cc.lastBlockTime).Seconds(), "")
// update node-down times for prometheus
for _, node := range cc.Nodes {
if node.down && !node.downSince.IsZero() {
td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Now().Sub(node.downSince).Seconds(), node.Url)
td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Since(node.downSince).Seconds(), node.Url)
}
}
}
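
The heart of the deduplication fix above is one map per notification service, keyed by message text: a live alert is sent once, a resolution removes the entry, and a resolution with no matching entry is logged and dropped. Here is a condensed, single-service sketch of that pattern (the real code adds mutexes, flap detection, and per-service routing):

```go
package main

import (
	"fmt"
	"time"
)

// sent records when each alert message was last dispatched.
var sent = map[string]time.Time{}

// notify reports whether a message should actually go out.
func notify(message string, resolved bool) bool {
	_, seen := sent[message]
	switch {
	case seen && !resolved:
		return false // duplicate alert, suppress
	case seen && resolved:
		delete(sent, message) // alarm cleared, let the resolution through
		return true
	case resolved:
		fmt.Println("duplicate resolution, ignoring:", message)
		return false
	}
	sent[message] = time.Now()
	return true
}

func main() {
	fmt.Println(notify("node down", false)) // true: first alert
	fmt.Println(notify("node down", false)) // false: deduplicated
	fmt.Println(notify("node down", true))  // true: resolution
	fmt.Println(notify("node down", true))  // false: nothing left to resolve
}
```

Running it prints true, false, true, false: the duplicate alert and the duplicate resolution are both suppressed.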
2 changes: 1 addition & 1 deletion td2/chain-details.go
@@ -16,7 +16,7 @@ var altValopers = &valoperOverrides{
"ival": "ica", // Iris hub

// TODO: was told tgrade also has a custom prefix, but not sure what the pair is
//"tval": "tvalcons",
// "tval": "tvalcons",
},
}

10 changes: 7 additions & 3 deletions td2/dashboard/server.go
@@ -29,7 +29,6 @@ func Serve(port string, updates chan *ChainStatus, logs chan LogMessage, hideLog
log.Fatalln(err)
}
var cast broadcast.Broadcaster
defer cast.Discard()

// cache the json .... don't serialize on-demand
logCache, statusCache := []byte{'[', ']'}, []byte{'{', '}'}
@@ -115,7 +114,10 @@
sub := cast.Listen()
defer sub.Discard()
for message := range sub.Channel() {
_ = c.WriteMessage(websocket.TextMessage, message.([]byte))
e := c.WriteMessage(websocket.TextMessage, message.([]byte))
if e != nil {
return
}
}
})

@@ -139,7 +141,9 @@
})

http.Handle("/", &CacheHandler{})
log.Fatal("tenderduty - dashboard:", http.ListenAndServe(":"+port, nil))
err = http.ListenAndServe(":"+port, nil)
cast.Discard()
log.Fatal("tenderduty dashboard server failed", err)
}

// CacheHandler implements the Handler interface with a Cache-Control set on responses
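
The websocket change above is subtle: each connection holds a broadcast subscription, and ignoring write errors meant the loop kept spinning after a client disconnected, so the subscription was never discarded. Returning on the first failed write lets the deferred cleanup run. A minimal sketch of that listener shape, using stand-in function arguments rather than the real websocket and broadcast APIs:

```go
package main

import (
	"errors"
	"fmt"
)

// pump fans messages out to one client and exits on the first failed
// write, so the deferred cleanup can release the client's subscription.
// write stands in for a websocket WriteMessage; discard for sub.Discard.
func pump(sub <-chan []byte, write func([]byte) error, discard func()) {
	defer discard()
	for msg := range sub {
		if err := write(msg); err != nil {
			return // dead client: stop instead of spinning forever
		}
	}
}

func main() {
	sub := make(chan []byte, 2)
	sub <- []byte("a")
	sub <- []byte("b")
	close(sub)
	pump(sub,
		func(b []byte) error { return errors.New("client gone") },
		func() { fmt.Println("subscription discarded") },
	)
}
```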
1 change: 0 additions & 1 deletion td2/prometheus.go
@@ -42,7 +42,6 @@ type promUpdate struct {
name string
chainId string
moniker string
blocknum string
endpoint string
}

10 changes: 4 additions & 6 deletions td2/rpc.go
@@ -112,16 +112,13 @@ func (cc *ChainConfig) newRpc() error {
Blocks: cc.blocksResults,
}
}
return errors.New("📵 no usable endpoints available for " + cc.ChainId)
return errors.New("no usable endpoints available for " + cc.ChainId)
}

func (cc *ChainConfig) monitorHealth(ctx context.Context, chainName string) {
tick := time.NewTicker(time.Minute)
if cc.client == nil {
e := cc.newRpc()
if e != nil {
l("💥", cc.ChainId, e)
}
_ = cc.newRpc()
}

for {
@@ -145,7 +142,7 @@ func (cc *ChainConfig) monitorHealth(ctx context.Context, chainName string) {
node.downSince = time.Now()
}
if td.Prom {
td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Now().Sub(node.downSince).Seconds(), node.Url)
td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Since(node.downSince).Seconds(), node.Url)
}
l("⚠️ " + node.lastMsg)
}
@@ -173,6 +170,7 @@
// node's OK, clear the note
if node.down {
node.lastMsg = ""
node.wasDown = true
}
td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, 0, node.Url)
node.down = false
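
Several hunks in this file and alert.go swap time.Now().Sub(x) for time.Since(x), one of the staticcheck findings (S1012) mentioned in the commit message. The two expressions are equivalent; time.Since is simply the idiomatic spelling:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	start := time.Now().Add(-3 * time.Minute)

	// Equivalent expressions; staticcheck (S1012) prefers the second.
	d1 := time.Now().Sub(start)
	d2 := time.Since(start)

	fmt.Println(d1.Round(time.Second), d2.Round(time.Second)) // both ~3m0s
}
```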
29 changes: 20 additions & 9 deletions td2/run.go
@@ -13,9 +13,9 @@ import (

var td = &Config{}

func Run(configFile, stateFile string, dumpConfig bool) error {
func Run(configFile, stateFile string) error {
var err error
td, err = loadConfig(configFile, stateFile, dumpConfig)
td, err = loadConfig(configFile, stateFile)
if err != nil {
return err
}
@@ -56,12 +56,12 @@ func Run(configFile, stateFile string, dumpConfig bool) error {
}()

if td.EnableDash {
l("starting dashboard on", td.Listen)
go dash.Serve(td.Listen, td.updateChan, td.logChan, td.HideLogs)
l("starting dashboard on", td.Listen)
} else {
go func() {
for {
_ = <-td.updateChan
<-td.updateChan
}
}()
}
@@ -70,7 +70,7 @@ func Run(configFile, stateFile string, dumpConfig bool) error {
} else {
go func() {
for {
_ = <-td.statsChan
<-td.statsChan
}
}()
}
@@ -85,7 +85,6 @@ func Run(configFile, stateFile string, dumpConfig bool) error {
// node health checks:
go func() {
for {
time.Sleep(time.Minute)
cc.monitorHealth(td.ctx, name)
}
}()
@@ -121,7 +120,7 @@

func saveOnExit(stateFile string, saved chan interface{}) {
quitting := make(chan os.Signal, 1)
signal.Notify(quitting, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGKILL)
signal.Notify(quitting, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)

saveState := func() {
defer close(saved)
@@ -141,9 +140,21 @@ func saveOnExit(stateFile string, saved chan interface{}) {
blocks[k] = v.blocksResults
}
}
nodesDown := make(map[string]map[string]time.Time)
for k, v := range td.Chains {
for _, node := range v.Nodes {
if node.down {
if nodesDown[k] == nil {
nodesDown[k] = make(map[string]time.Time)
}
nodesDown[k][node.Url] = node.downSince
}
}
}
b, e := json.Marshal(&savedState{
Alarms: alarms,
Blocks: blocks,
Alarms: alarms,
Blocks: blocks,
NodesDown: nodesDown,
})
if e != nil {
log.Println(e)
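
Two changes above are worth unpacking: savedState now carries a NodesDown map of down-node timestamps so node-down alerts survive a restart (the state-restore fix credited to m8nky), and syscall.SIGKILL was dropped from signal.Notify because SIGKILL can never be delivered to a handler, so listening for it is a no-op. A minimal shutdown handler in the same shape, with the state write reduced to a comment:

```go
package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	quitting := make(chan os.Signal, 1)
	// SIGKILL (and SIGSTOP) are uncatchable, so listing them is pointless;
	// only trap the signals the process can actually observe.
	signal.Notify(quitting, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)

	fmt.Println("waiting for shutdown signal ...")
	sig := <-quitting
	fmt.Println("caught", sig, "- saving state before exit")
	// ... persist alarms, blocks, and down-node timestamps here ...
}
```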
1 change: 0 additions & 1 deletion td2/static/grid.js
@@ -36,7 +36,6 @@ function lightMode() {

function fix_dpi(id) {
let canvas = document.getElementById(id),
ctx = canvas.getContext('2d'),
dpi = window.devicePixelRatio;
gridH = h * dpi.valueOf()
gridW = w * dpi.valueOf()