diff --git a/grctl/cmd/node.go b/grctl/cmd/node.go index f42e678bfa..9c9d95bdb9 100644 --- a/grctl/cmd/node.go +++ b/grctl/cmd/node.go @@ -216,7 +216,7 @@ func NewCmdNode() cli.Command { table.AddRow("down", v.DownTime) table.AddRow("connected", v.Connected) fmt.Println(table) - fmt.Printf("-------------------ervice health-----------------------\n") + fmt.Printf("-------------------service health-----------------------\n") serviceTable := termtables.CreateTable() serviceTable.AddHeaders("Title", "Result", "Message") extractReady(serviceTable, v, "Ready") diff --git a/monitor/prometheus/rules_manager.go b/monitor/prometheus/rules_manager.go index c9ef90b2bf..03e8192edf 100644 --- a/monitor/prometheus/rules_manager.go +++ b/monitor/prometheus/rules_manager.go @@ -6,7 +6,6 @@ import ( "gopkg.in/yaml.v2" "os" "github.com/goodrain/rainbond/cmd/monitor/option" - ) type AlertingRulesConfig struct { @@ -28,26 +27,13 @@ type RulesConfig struct { type AlertingRulesManager struct { RulesConfig *AlertingRulesConfig - config *option.Config + config *option.Config } func NewRulesManager(config *option.Config) *AlertingRulesManager { a := &AlertingRulesManager{ RulesConfig: &AlertingRulesConfig{ Groups: []*AlertingNameConfig{ - &AlertingNameConfig{ - - Name: "InstanceHealth", - Rules: []*RulesConfig{ - &RulesConfig{ - Alert: "InstanceDown", - Expr: "up == 0", - For: "3m", - Labels: map[string]string{}, - Annotations: map[string]string{"summary": "builder {{$labels.instance}} down", "description":"{{$labels.instance}} of job {{$labels.job}} has been down for more than 3 minutes"}, - }, - }, - }, &AlertingNameConfig{ Name: "BuilderHealth", @@ -161,6 +147,47 @@ func NewRulesManager(config *option.Config) *AlertingRulesManager { }, }, }, + &AlertingNameConfig{ + + Name: "NodeHealth", + Rules: []*RulesConfig{ + &RulesConfig{ + Alert: "high_cpu_usage_on_node", + Expr: "sum by(instance) (rate(process_cpu_seconds_total[5m])) * 100 > 70", + For: "5m", + Labels: 
map[string]string{"service": "node_cpu"}, + Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.", "summary": "HIGH CPU USAGE WARNING ON '{{ $labels.instance }}'"}, + }, + &RulesConfig{ + Alert: "high_la_usage_on_node", + Expr: "node_load5 > 5", + For: "5m", + Labels: map[string]string{"service": "node_load5"}, + Annotations: map[string]string{"description": "{{ $labels.instance }} has a high load average. Load Average 5m is {{ humanize $value}}.", "summary": "HIGH LOAD AVERAGE WARNING ON '{{ $labels.instance }}'"}, + }, + &RulesConfig{ + Alert: "node_running_out_of_disk_space", + Expr: "(node_filesystem_size{mountpoint='/'} - node_filesystem_free{mountpoint='/'}) * 100 / node_filesystem_size{mountpoint='/'} > 80", + For: "5m", + Labels: map[string]string{"service": "node_running_out_of_disk_space"}, + Annotations: map[string]string{"description": "More than 80% of disk used. Disk usage {{ humanize $value }}%.", "summary": "LOW DISK SPACE WARNING:NODE '{{ $labels.instance }}'"}, + }, + &RulesConfig{ + Alert: "monitoring_service_down", + Expr: "up == 0", + For: "5m", + Labels: map[string]string{"service": "service_down"}, + Annotations: map[string]string{"description": "The monitoring service '{{ $labels.job }}' is down.", "summary": "MONITORING SERVICE DOWN WARNING:NODE '{{ $labels.instance }}'"}, + }, + &RulesConfig{ + Alert: "high_memory_usage_on_node", + Expr: "((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal) * 100 > 80", + For: "5m", + Labels: map[string]string{"service": "node_memory"}, + Annotations: map[string]string{"description": "{{ $labels.instance }} is using a LOT of MEMORY. MEMORY usage is over {{ humanize $value}}%.", "summary": "HIGH MEMORY USAGE WARNING TASK ON '{{ $labels.instance }}'"}, + }, + }, + }, }, }, config: config,