diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index c458a80..5d849ce 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -8,7 +8,7 @@ name: Docker on: push: branches: - - "release/*" + #- "release/*" - main # Publish semver tags as releases. tags: diff --git a/.gitignore b/.gitignore index 4502584..7a0a870 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,9 @@ config.yml docker-compose.yml .tenderduty-state.json +tenderduty-* tenderduty +local* .idea .DS_Store diff --git a/docs/config.md b/docs/config.md index ac7d799..c0c6aad 100644 --- a/docs/config.md +++ b/docs/config.md @@ -103,7 +103,7 @@ chains: | `chain."name".alerts.percentage_missed` | What percentage should trigger the alert? | | `chain."name".alerts.percentage_priority` | NOT USED: future hint for pagerduty's routing. | | `chain."name".alerts.alert_if_inactive` | Should an alert be sent if the validator is not in the active set: jailed, tombstoned, or unbonding? | -| `chain."name".alerts.alert_if_no_servers` | Should an alert be sent if no RPC servers are responding? (Note this alarm is instantaneous with no delay) | +| `chain."name".alerts.alert_if_no_servers` | Should an alert be sent if no RPC servers are responding? (Note this alarm uses the node_down_alert_minutes setting) | | `chain."name".alerts.pagerduty.*` | This section is the same as the pagerduty structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the api_key is blank it will use the settings defined in `pagerduty.*`
*Note both `pagerduty.enabled` and `chain."name".alerts.pagerduty.enabled` must be 'yes' to get alerts.* | | `chain."name".alerts.discord.*` | This section is the same as the discord structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the webhook is blank it will use the settings defined in `discord.*`
*Note both `discord.enabled` and `chain."name".alerts.discord.enabled` must be 'yes' to get alerts.* | | `chain."name".alerts.telegram.*` | This section is the same as the telegram structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the api_key and channel are blank it will use the settings defined in `telegram.*`
*Note both `telegram.enabled` and `chain."name".alerts.telegram.enabled` must be 'yes' to get alerts.* | diff --git a/main.go b/main.go index 85d2eb3..f0134ca 100644 --- a/main.go +++ b/main.go @@ -25,7 +25,7 @@ func main() { os.Exit(0) } - err := td2.Run(configFile, stateFile, dumpConfig) + err := td2.Run(configFile, stateFile) if err != nil { log.Println(err.Error(), "... exiting.") } diff --git a/td2/alert.go b/td2/alert.go index eae244b..a8d9db4 100644 --- a/td2/alert.go +++ b/td2/alert.go @@ -80,15 +80,22 @@ func shouldNotify(msg *alertMsg, dest notifyDest) bool { whichMap = alarms.SentDiAlarms service = "Discord" } - if !whichMap[msg.message].IsZero() && !msg.resolved { + + switch { + case !whichMap[msg.message].IsZero() && !msg.resolved: // already sent this alert return false - } else if !whichMap[msg.message].IsZero() && msg.resolved { + case !whichMap[msg.message].IsZero() && msg.resolved: // alarm is cleared delete(whichMap, msg.message) l(fmt.Sprintf("💜 Resolved alarm on %s (%s) - notifying %s", msg.chain, msg.message, service)) return true + case msg.resolved: + // it looks like we got a duplicate resolution or suppressed it. Note it and move on: + l(fmt.Sprintf("😕 Not clearing alarm on %s (%s) - no corresponding alert %s", msg.chain, msg.message, service)) + return false } + // check if the alarm is flapping, if we sent the same alert in the last five minutes, show a warning but don't alert if alarms.flappingAlarms[msg.chain] == nil { alarms.flappingAlarms[msg.chain] = make(map[string]time.Time) @@ -138,11 +145,6 @@ func notifyDiscord(msg *alertMsg) (err error) { if resp.StatusCode != 204 { log.Println(resp) - //if resp.Body != nil { - // b, _ := ioutil.ReadAll(resp.Body) - // _ = resp.Body.Close() - // fmt.Println(string(b)) - //} l("notify discord:", err) return err } @@ -184,7 +186,6 @@ func notifyTg(msg *alertMsg) (err error) { if !shouldNotify(msg, tg) { return nil } - //tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5" bot, err := tgbotapi.NewBotAPI(msg.tgKey) if err != nil { l("notify telegram:", err) @@ -197,7 +198,6 @@ func notifyTg(msg *alertMsg) (err error) { } mc := tgbotapi.NewMessageToChannel(msg.tgChannel, fmt.Sprintf("%s: %s - %s", msg.chain, prefix, msg.message)) - //mc.ParseMode = "html" _, err = bot.Send(mc) if err != nil { l("telegram send:", err) @@ -240,7 +240,6 @@ func getAlarms(chain string) string { alarms.notifyMux.RLock() defer alarms.notifyMux.RUnlock() // don't show this info if the logs are disabled on the dashboard, potentially sensitive info could be leaked. - //if td.HideLogs || currentAlarms[chain] == nil { if td.HideLogs || alarms.AllAlarms[chain] == nil { return "" } @@ -299,10 +298,11 @@ func (cc *ChainConfig) watch() { nodeAlarms := make(map[string]bool) // wait until we have a moniker: + noNodesSec := 0 // delay a no-nodes alarm for 30 seconds, too noisy. for { if cc.valInfo == nil || cc.valInfo.Moniker == "not connected" { time.Sleep(time.Second) - if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes { + if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes && noNodesSec >= 60*td.NodeDownMin { noNodes = true td.alert( cc.name, @@ -312,8 +312,10 @@ func (cc *ChainConfig) watch() { &cc.valInfo.Valcons, ) } + noNodesSec += 1 continue } + noNodesSec = 0 break } // initial stat creation for nodes, we only update again if the node is positive @@ -327,16 +329,26 @@ func (cc *ChainConfig) watch() { time.Sleep(2 * time.Second) // alert if we can't monitor - if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes { - noNodes = true - td.alert( - cc.name, - fmt.Sprintf("no RPC endpoints are working for %s", cc.ChainId), - "critical", - false, - &cc.valInfo.Valcons, - ) - } else if cc.Alerts.AlertIfNoServers && noNodes && !cc.noNodes { + switch { + case cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes: + noNodesSec += 2 + if noNodesSec <= 30*td.NodeDownMin { + if noNodesSec%20 == 0 { + l(fmt.Sprintf("no nodes available on %s for %d seconds, deferring alarm", cc.ChainId, noNodesSec)) + } + noNodes = false + } else { + noNodesSec = 0 + noNodes = true + td.alert( + cc.name, + fmt.Sprintf("no RPC endpoints are working for %s", cc.ChainId), + "critical", + false, + &cc.valInfo.Valcons, + ) + } + case cc.Alerts.AlertIfNoServers && noNodes && !cc.noNodes: noNodes = false td.alert( cc.name, @@ -345,6 +357,8 @@ func (cc *ChainConfig) watch() { true, &cc.valInfo.Valcons, ) + default: + noNodesSec = 0 } // stalled chain detection @@ -428,7 +442,6 @@ func (cc *ChainConfig) watch() { } // window percentage missed block alarms - //fmt.Println(100*float64(cc.valInfo.Missed)/float64(cc.valInfo.Window), float64(cc.Alerts.Window)) if cc.Alerts.PercentageAlerts && !pctAlarm && 100*float64(cc.valInfo.Missed)/float64(cc.valInfo.Window) > float64(cc.Alerts.Window) { // alert on missed block counter! pctAlarm = true @@ -458,10 +471,15 @@ func (cc *ChainConfig) watch() { // node down alarms for _, node := range cc.Nodes { // window percentage missed block alarms - if node.AlertIfDown && node.down && !nodeAlarms[node.Url] && !node.downSince.IsZero() && time.Now().Sub(node.downSince).Minutes() > float64(td.NodeDownMin) { + if node.AlertIfDown && node.down && !node.wasDown && !node.downSince.IsZero() && + time.Since(node.downSince) > time.Duration(td.NodeDownMin)*time.Minute { // alert on dead node - cc.activeAlerts += 1 - nodeAlarms[node.Url] = true + if !nodeAlarms[node.Url] { + cc.activeAlerts += 1 + } else { + continue + } + nodeAlarms[node.Url] = true // used to keep active alert count correct td.alert( cc.name, fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId), @@ -469,10 +487,11 @@ func (cc *ChainConfig) watch() { false, &node.Url, ) - } else if nodeAlarms[node.Url] && node.downSince.IsZero() { + } else if node.AlertIfDown && !node.down && node.wasDown { // clear the alert - cc.activeAlerts -= 1 nodeAlarms[node.Url] = false + cc.activeAlerts -= 1 + node.wasDown = false td.alert( cc.name, fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId), @@ -485,11 +504,11 @@ func (cc *ChainConfig) watch() { if td.Prom { // raw block timer, ignoring finalized state - td.statsChan <- cc.mkUpdate(metricLastBlockSecondsNotFinal, time.Now().Sub(cc.lastBlockTime).Seconds(), "") + td.statsChan <- cc.mkUpdate(metricLastBlockSecondsNotFinal, time.Since(cc.lastBlockTime).Seconds(), "") // update node-down times for prometheus for _, node := range cc.Nodes { if node.down && !node.downSince.IsZero() { - td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Now().Sub(node.downSince).Seconds(), node.Url) + td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Since(node.downSince).Seconds(), node.Url) } } } diff --git a/td2/chain-details.go b/td2/chain-details.go index b49dba9..967534b 100644 --- a/td2/chain-details.go +++ b/td2/chain-details.go @@ -16,7 +16,7 @@ var altValopers = &valoperOverrides{ "ival": "ica", // Iris hub // TODO: was told tgrade also has a custom prefix, but not sure what the pair is - //"tval": "tvalcons", + // "tval": "tvalcons", }, } diff --git a/td2/dashboard/server.go b/td2/dashboard/server.go index 80bedb8..12a97b5 100644 --- a/td2/dashboard/server.go +++ b/td2/dashboard/server.go @@ -29,7 +29,6 @@ func Serve(port string, updates chan *ChainStatus, logs chan LogMessage, hideLog log.Fatalln(err) } var cast broadcast.Broadcaster - defer cast.Discard() // cache the json .... don't serialize on-demand logCache, statusCache := []byte{'[', ']'}, []byte{'{', '}'} @@ -115,7 +114,10 @@ func Serve(port string, updates chan *ChainStatus, logs chan LogMessage, hideLog sub := cast.Listen() defer sub.Discard() for message := range sub.Channel() { - _ = c.WriteMessage(websocket.TextMessage, message.([]byte)) + e := c.WriteMessage(websocket.TextMessage, message.([]byte)) + if e != nil { + return + } } }) @@ -139,7 +141,9 @@ func Serve(port string, updates chan *ChainStatus, logs chan LogMessage, hideLog }) http.Handle("/", &CacheHandler{}) - log.Fatal("tenderduty - dashboard:", http.ListenAndServe(":"+port, nil)) + err = http.ListenAndServe(":"+port, nil) + cast.Discard() + log.Fatal("tenderduty dashboard server failed", err) } // CacheHandler implements the Handler interface with a Cache-Control set on responses diff --git a/td2/prometheus.go b/td2/prometheus.go index e3b573c..30236ca 100644 --- a/td2/prometheus.go +++ b/td2/prometheus.go @@ -42,7 +42,6 @@ type promUpdate struct { name string chainId string moniker string - blocknum string endpoint string } diff --git a/td2/rpc.go b/td2/rpc.go index 0849697..0f73354 100644 --- a/td2/rpc.go +++ b/td2/rpc.go @@ -112,16 +112,13 @@ func (cc *ChainConfig) newRpc() error { Blocks: cc.blocksResults, } } - return errors.New("📵 no usable endpoints available for " + cc.ChainId) + return errors.New("no usable endpoints available for " + cc.ChainId) } func (cc *ChainConfig) monitorHealth(ctx context.Context, chainName string) { tick := time.NewTicker(time.Minute) if cc.client == nil { - e := cc.newRpc() - if e != nil { - l("💥", cc.ChainId, e) - } + _ = cc.newRpc() } for { @@ -145,7 +142,7 @@ func (cc *ChainConfig) monitorHealth(ctx context.Context, chainName string) { node.downSince = time.Now() } if td.Prom { - td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Now().Sub(node.downSince).Seconds(), node.Url) + td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Since(node.downSince).Seconds(), node.Url) } l("⚠️ " + node.lastMsg) } @@ -173,6 +170,7 @@ func (cc *ChainConfig) monitorHealth(ctx context.Context, chainName string) { // node's OK, clear the note if node.down { node.lastMsg = "" + node.wasDown = true } td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, 0, node.Url) node.down = false diff --git a/td2/run.go b/td2/run.go index 393cf66..2749ae8 100644 --- a/td2/run.go +++ b/td2/run.go @@ -13,9 +13,9 @@ import ( var td = &Config{} -func Run(configFile, stateFile string, dumpConfig bool) error { +func Run(configFile, stateFile string) error { var err error - td, err = loadConfig(configFile, stateFile, dumpConfig) + td, err = loadConfig(configFile, stateFile) if err != nil { return err } @@ -56,12 +56,12 @@ func Run(configFile, stateFile string, dumpConfig bool) error { }() if td.EnableDash { - l("starting dashboard on", td.Listen) go dash.Serve(td.Listen, td.updateChan, td.logChan, td.HideLogs) + l("starting dashboard on", td.Listen) } else { go func() { for { - _ = <-td.updateChan + <-td.updateChan } }() } @@ -70,7 +70,7 @@ func Run(configFile, stateFile string, dumpConfig bool) error { } else { go func() { for { - _ = <-td.statsChan + <-td.statsChan } }() } @@ -85,7 +85,6 @@ func Run(configFile, stateFile string, dumpConfig bool) error { // node health checks: go func() { for { - time.Sleep(time.Minute) cc.monitorHealth(td.ctx, name) } }() @@ -121,7 +120,7 @@ func Run(configFile, stateFile string, dumpConfig bool) error { func saveOnExit(stateFile string, saved chan interface{}) { quitting := make(chan os.Signal, 1) - signal.Notify(quitting, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGKILL) + signal.Notify(quitting, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP) saveState := func() { defer close(saved) @@ -141,9 +140,21 @@ func saveOnExit(stateFile string, saved chan interface{}) { blocks[k] = v.blocksResults } } + nodesDown := make(map[string]map[string]time.Time) + for k, v := range td.Chains { + for _, node := range v.Nodes { + if node.down { + if nodesDown[k] == nil { + nodesDown[k] = make(map[string]time.Time) + } + nodesDown[k][node.Url] = node.downSince + } + } + } b, e := json.Marshal(&savedState{ - Alarms: alarms, - Blocks: blocks, + Alarms: alarms, + Blocks: blocks, + NodesDown: nodesDown, }) if e != nil { log.Println(e) diff --git a/td2/static/grid.js b/td2/static/grid.js index 7e6990d..7bd05d3 100644 --- a/td2/static/grid.js +++ b/td2/static/grid.js @@ -36,7 +36,6 @@ function lightMode() { function fix_dpi(id) { let canvas = document.getElementById(id), - ctx = canvas.getContext('2d'), dpi = window.devicePixelRatio; gridH = h * dpi.valueOf() gridW = w * dpi.valueOf() diff --git a/td2/static/index.html b/td2/static/index.html index 636e5ed..7b2425a 100644 --- a/td2/static/index.html +++ b/td2/static/index.html @@ -1,5 +1,5 @@ - +