diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
index c458a80..5d849ce 100644
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -8,7 +8,7 @@ name: Docker
on:
push:
branches:
- - "release/*"
+ #- "release/*"
- main
# Publish semver tags as releases.
tags:
diff --git a/.gitignore b/.gitignore
index 4502584..7a0a870 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,9 @@
config.yml
docker-compose.yml
.tenderduty-state.json
+tenderduty-*
tenderduty
+local*
.idea
.DS_Store
diff --git a/docs/config.md b/docs/config.md
index ac7d799..c0c6aad 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -103,7 +103,7 @@ chains:
| `chain."name".alerts.percentage_missed` | What percentage should trigger the alert? |
| `chain."name".alerts.percentage_priority` | NOT USED: future hint for pagerduty's routing. |
| `chain."name".alerts.alert_if_inactive` | Should an alert be sent if the validator is not in the active set: jailed, tombstoned, or unbonding? |
-| `chain."name".alerts.alert_if_no_servers` | Should an alert be sent if no RPC servers are responding? (Note this alarm is instantaneous with no delay) |
+| `chain."name".alerts.alert_if_no_servers` | Should an alert be sent if no RPC servers are responding? (Note this alarm uses the node_down_alert_minutes setting) |
| `chain."name".alerts.pagerduty.*` | This section is the same as the pagerduty structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the api_key is blank it will use the settings defined in `pagerduty.*`
*Note both `pagerduty.enabled` and `chain."name".alerts.pagerduty.enabled` must be 'yes' to get alerts.* |
| `chain."name".alerts.discord.*` | This section is the same as the discord structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the webhook is blank it will use the settings defined in `discord.*`
*Note both `discord.enabled` and `chain."name".alerts.discord.enabled` must be 'yes' to get alerts.* |
| `chain."name".alerts.telegram.*` | This section is the same as the telegram structure above. It allows disabling or enabling specific settings on a per-chain basis. Including routing to a different destination. If the api_key and channel are blank it will use the settings defined in `telegram.*`
*Note both `telegram.enabled` and `chain."name".alerts.telegram.enabled` must be 'yes' to get alerts.* |
diff --git a/main.go b/main.go
index 85d2eb3..f0134ca 100644
--- a/main.go
+++ b/main.go
@@ -25,7 +25,7 @@ func main() {
os.Exit(0)
}
- err := td2.Run(configFile, stateFile, dumpConfig)
+ err := td2.Run(configFile, stateFile)
if err != nil {
log.Println(err.Error(), "... exiting.")
}
diff --git a/td2/alert.go b/td2/alert.go
index eae244b..a8d9db4 100644
--- a/td2/alert.go
+++ b/td2/alert.go
@@ -80,15 +80,22 @@ func shouldNotify(msg *alertMsg, dest notifyDest) bool {
whichMap = alarms.SentDiAlarms
service = "Discord"
}
- if !whichMap[msg.message].IsZero() && !msg.resolved {
+
+ switch {
+ case !whichMap[msg.message].IsZero() && !msg.resolved:
// already sent this alert
return false
- } else if !whichMap[msg.message].IsZero() && msg.resolved {
+ case !whichMap[msg.message].IsZero() && msg.resolved:
// alarm is cleared
delete(whichMap, msg.message)
l(fmt.Sprintf("💜 Resolved alarm on %s (%s) - notifying %s", msg.chain, msg.message, service))
return true
+ case msg.resolved:
+ // it looks like we got a duplicate resolution or suppressed it. Note it and move on:
+		l(fmt.Sprintf("😕 Not clearing alarm on %s (%s) - no corresponding alert was sent via %s", msg.chain, msg.message, service))
+ return false
}
+
// check if the alarm is flapping, if we sent the same alert in the last five minutes, show a warning but don't alert
if alarms.flappingAlarms[msg.chain] == nil {
alarms.flappingAlarms[msg.chain] = make(map[string]time.Time)
@@ -138,11 +145,6 @@ func notifyDiscord(msg *alertMsg) (err error) {
if resp.StatusCode != 204 {
log.Println(resp)
- //if resp.Body != nil {
- // b, _ := ioutil.ReadAll(resp.Body)
- // _ = resp.Body.Close()
- // fmt.Println(string(b))
- //}
l("notify discord:", err)
return err
}
@@ -184,7 +186,6 @@ func notifyTg(msg *alertMsg) (err error) {
if !shouldNotify(msg, tg) {
return nil
}
- //tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5"
bot, err := tgbotapi.NewBotAPI(msg.tgKey)
if err != nil {
l("notify telegram:", err)
@@ -197,7 +198,6 @@ func notifyTg(msg *alertMsg) (err error) {
}
mc := tgbotapi.NewMessageToChannel(msg.tgChannel, fmt.Sprintf("%s: %s - %s", msg.chain, prefix, msg.message))
- //mc.ParseMode = "html"
_, err = bot.Send(mc)
if err != nil {
l("telegram send:", err)
@@ -240,7 +240,6 @@ func getAlarms(chain string) string {
alarms.notifyMux.RLock()
defer alarms.notifyMux.RUnlock()
// don't show this info if the logs are disabled on the dashboard, potentially sensitive info could be leaked.
- //if td.HideLogs || currentAlarms[chain] == nil {
if td.HideLogs || alarms.AllAlarms[chain] == nil {
return ""
}
@@ -299,10 +298,11 @@ func (cc *ChainConfig) watch() {
nodeAlarms := make(map[string]bool)
// wait until we have a moniker:
+	noNodesSec := 0 // defer a no-nodes alarm until node_down_alert_minutes elapses, instant alerts were too noisy.
for {
if cc.valInfo == nil || cc.valInfo.Moniker == "not connected" {
time.Sleep(time.Second)
- if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes {
+ if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes && noNodesSec >= 60*td.NodeDownMin {
noNodes = true
td.alert(
cc.name,
@@ -312,8 +312,10 @@ func (cc *ChainConfig) watch() {
&cc.valInfo.Valcons,
)
}
+ noNodesSec += 1
continue
}
+ noNodesSec = 0
break
}
// initial stat creation for nodes, we only update again if the node is positive
@@ -327,16 +329,26 @@ func (cc *ChainConfig) watch() {
time.Sleep(2 * time.Second)
// alert if we can't monitor
- if cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes {
- noNodes = true
- td.alert(
- cc.name,
- fmt.Sprintf("no RPC endpoints are working for %s", cc.ChainId),
- "critical",
- false,
- &cc.valInfo.Valcons,
- )
- } else if cc.Alerts.AlertIfNoServers && noNodes && !cc.noNodes {
+ switch {
+ case cc.Alerts.AlertIfNoServers && !noNodes && cc.noNodes:
+ noNodesSec += 2
+			if noNodesSec <= 60*td.NodeDownMin {
+ if noNodesSec%20 == 0 {
+ l(fmt.Sprintf("no nodes available on %s for %d seconds, deferring alarm", cc.ChainId, noNodesSec))
+ }
+ noNodes = false
+ } else {
+ noNodesSec = 0
+ noNodes = true
+ td.alert(
+ cc.name,
+ fmt.Sprintf("no RPC endpoints are working for %s", cc.ChainId),
+ "critical",
+ false,
+ &cc.valInfo.Valcons,
+ )
+ }
+ case cc.Alerts.AlertIfNoServers && noNodes && !cc.noNodes:
noNodes = false
td.alert(
cc.name,
@@ -345,6 +357,8 @@ func (cc *ChainConfig) watch() {
true,
&cc.valInfo.Valcons,
)
+ default:
+ noNodesSec = 0
}
// stalled chain detection
@@ -428,7 +442,6 @@ func (cc *ChainConfig) watch() {
}
// window percentage missed block alarms
- //fmt.Println(100*float64(cc.valInfo.Missed)/float64(cc.valInfo.Window), float64(cc.Alerts.Window))
if cc.Alerts.PercentageAlerts && !pctAlarm && 100*float64(cc.valInfo.Missed)/float64(cc.valInfo.Window) > float64(cc.Alerts.Window) {
// alert on missed block counter!
pctAlarm = true
@@ -458,10 +471,15 @@ func (cc *ChainConfig) watch() {
// node down alarms
for _, node := range cc.Nodes {
// window percentage missed block alarms
- if node.AlertIfDown && node.down && !nodeAlarms[node.Url] && !node.downSince.IsZero() && time.Now().Sub(node.downSince).Minutes() > float64(td.NodeDownMin) {
+ if node.AlertIfDown && node.down && !node.wasDown && !node.downSince.IsZero() &&
+ time.Since(node.downSince) > time.Duration(td.NodeDownMin)*time.Minute {
// alert on dead node
- cc.activeAlerts += 1
- nodeAlarms[node.Url] = true
+ if !nodeAlarms[node.Url] {
+ cc.activeAlerts += 1
+ } else {
+ continue
+ }
+ nodeAlarms[node.Url] = true // used to keep active alert count correct
td.alert(
cc.name,
fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
@@ -469,10 +487,11 @@ func (cc *ChainConfig) watch() {
false,
&node.Url,
)
- } else if nodeAlarms[node.Url] && node.downSince.IsZero() {
+ } else if node.AlertIfDown && !node.down && node.wasDown {
// clear the alert
- cc.activeAlerts -= 1
nodeAlarms[node.Url] = false
+ cc.activeAlerts -= 1
+ node.wasDown = false
td.alert(
cc.name,
fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId),
@@ -485,11 +504,11 @@ func (cc *ChainConfig) watch() {
if td.Prom {
// raw block timer, ignoring finalized state
- td.statsChan <- cc.mkUpdate(metricLastBlockSecondsNotFinal, time.Now().Sub(cc.lastBlockTime).Seconds(), "")
+ td.statsChan <- cc.mkUpdate(metricLastBlockSecondsNotFinal, time.Since(cc.lastBlockTime).Seconds(), "")
// update node-down times for prometheus
for _, node := range cc.Nodes {
if node.down && !node.downSince.IsZero() {
- td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Now().Sub(node.downSince).Seconds(), node.Url)
+ td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Since(node.downSince).Seconds(), node.Url)
}
}
}
diff --git a/td2/chain-details.go b/td2/chain-details.go
index b49dba9..967534b 100644
--- a/td2/chain-details.go
+++ b/td2/chain-details.go
@@ -16,7 +16,7 @@ var altValopers = &valoperOverrides{
"ival": "ica", // Iris hub
// TODO: was told tgrade also has a custom prefix, but not sure what the pair is
- //"tval": "tvalcons",
+ // "tval": "tvalcons",
},
}
diff --git a/td2/dashboard/server.go b/td2/dashboard/server.go
index 80bedb8..12a97b5 100644
--- a/td2/dashboard/server.go
+++ b/td2/dashboard/server.go
@@ -29,7 +29,6 @@ func Serve(port string, updates chan *ChainStatus, logs chan LogMessage, hideLog
log.Fatalln(err)
}
var cast broadcast.Broadcaster
- defer cast.Discard()
// cache the json .... don't serialize on-demand
logCache, statusCache := []byte{'[', ']'}, []byte{'{', '}'}
@@ -115,7 +114,10 @@ func Serve(port string, updates chan *ChainStatus, logs chan LogMessage, hideLog
sub := cast.Listen()
defer sub.Discard()
for message := range sub.Channel() {
- _ = c.WriteMessage(websocket.TextMessage, message.([]byte))
+ e := c.WriteMessage(websocket.TextMessage, message.([]byte))
+ if e != nil {
+ return
+ }
}
})
@@ -139,7 +141,9 @@ func Serve(port string, updates chan *ChainStatus, logs chan LogMessage, hideLog
})
http.Handle("/", &CacheHandler{})
- log.Fatal("tenderduty - dashboard:", http.ListenAndServe(":"+port, nil))
+ err = http.ListenAndServe(":"+port, nil)
+ cast.Discard()
+	log.Fatal("tenderduty dashboard server failed: ", err)
}
// CacheHandler implements the Handler interface with a Cache-Control set on responses
diff --git a/td2/prometheus.go b/td2/prometheus.go
index e3b573c..30236ca 100644
--- a/td2/prometheus.go
+++ b/td2/prometheus.go
@@ -42,7 +42,6 @@ type promUpdate struct {
name string
chainId string
moniker string
- blocknum string
endpoint string
}
diff --git a/td2/rpc.go b/td2/rpc.go
index 0849697..0f73354 100644
--- a/td2/rpc.go
+++ b/td2/rpc.go
@@ -112,16 +112,13 @@ func (cc *ChainConfig) newRpc() error {
Blocks: cc.blocksResults,
}
}
- return errors.New("📵 no usable endpoints available for " + cc.ChainId)
+ return errors.New("no usable endpoints available for " + cc.ChainId)
}
func (cc *ChainConfig) monitorHealth(ctx context.Context, chainName string) {
tick := time.NewTicker(time.Minute)
if cc.client == nil {
- e := cc.newRpc()
- if e != nil {
- l("💥", cc.ChainId, e)
- }
+ _ = cc.newRpc()
}
for {
@@ -145,7 +142,7 @@ func (cc *ChainConfig) monitorHealth(ctx context.Context, chainName string) {
node.downSince = time.Now()
}
if td.Prom {
- td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Now().Sub(node.downSince).Seconds(), node.Url)
+ td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, time.Since(node.downSince).Seconds(), node.Url)
}
l("⚠️ " + node.lastMsg)
}
@@ -173,6 +170,7 @@ func (cc *ChainConfig) monitorHealth(ctx context.Context, chainName string) {
// node's OK, clear the note
if node.down {
node.lastMsg = ""
+ node.wasDown = true
}
td.statsChan <- cc.mkUpdate(metricNodeDownSeconds, 0, node.Url)
node.down = false
diff --git a/td2/run.go b/td2/run.go
index 393cf66..2749ae8 100644
--- a/td2/run.go
+++ b/td2/run.go
@@ -13,9 +13,9 @@ import (
var td = &Config{}
-func Run(configFile, stateFile string, dumpConfig bool) error {
+func Run(configFile, stateFile string) error {
var err error
- td, err = loadConfig(configFile, stateFile, dumpConfig)
+ td, err = loadConfig(configFile, stateFile)
if err != nil {
return err
}
@@ -56,12 +56,12 @@ func Run(configFile, stateFile string, dumpConfig bool) error {
}()
if td.EnableDash {
- l("starting dashboard on", td.Listen)
go dash.Serve(td.Listen, td.updateChan, td.logChan, td.HideLogs)
+ l("starting dashboard on", td.Listen)
} else {
go func() {
for {
- _ = <-td.updateChan
+ <-td.updateChan
}
}()
}
@@ -70,7 +70,7 @@ func Run(configFile, stateFile string, dumpConfig bool) error {
} else {
go func() {
for {
- _ = <-td.statsChan
+ <-td.statsChan
}
}()
}
@@ -85,7 +85,6 @@ func Run(configFile, stateFile string, dumpConfig bool) error {
// node health checks:
go func() {
for {
- time.Sleep(time.Minute)
cc.monitorHealth(td.ctx, name)
}
}()
@@ -121,7 +120,7 @@ func Run(configFile, stateFile string, dumpConfig bool) error {
func saveOnExit(stateFile string, saved chan interface{}) {
quitting := make(chan os.Signal, 1)
- signal.Notify(quitting, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGKILL)
+ signal.Notify(quitting, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)
saveState := func() {
defer close(saved)
@@ -141,9 +140,21 @@ func saveOnExit(stateFile string, saved chan interface{}) {
blocks[k] = v.blocksResults
}
}
+ nodesDown := make(map[string]map[string]time.Time)
+ for k, v := range td.Chains {
+ for _, node := range v.Nodes {
+ if node.down {
+ if nodesDown[k] == nil {
+ nodesDown[k] = make(map[string]time.Time)
+ }
+ nodesDown[k][node.Url] = node.downSince
+ }
+ }
+ }
b, e := json.Marshal(&savedState{
- Alarms: alarms,
- Blocks: blocks,
+ Alarms: alarms,
+ Blocks: blocks,
+ NodesDown: nodesDown,
})
if e != nil {
log.Println(e)
diff --git a/td2/static/grid.js b/td2/static/grid.js
index 7e6990d..7bd05d3 100644
--- a/td2/static/grid.js
+++ b/td2/static/grid.js
@@ -36,7 +36,6 @@ function lightMode() {
function fix_dpi(id) {
let canvas = document.getElementById(id),
- ctx = canvas.getContext('2d'),
dpi = window.devicePixelRatio;
gridH = h * dpi.valueOf()
gridW = w * dpi.valueOf()
diff --git a/td2/static/index.html b/td2/static/index.html
index 636e5ed..7b2425a 100644
--- a/td2/static/index.html
+++ b/td2/static/index.html
@@ -1,5 +1,5 @@
-
+