1
1
package alerts
2
2
3
3
import (
4
+ "fmt"
5
+ "strings"
6
+
4
7
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
5
8
"k8s.io/apimachinery/pkg/util/intstr"
6
9
"k8s.io/utils/ptr"
7
10
)
8
11
12
// ignoredInterfacesForNetworkDown lists the network devices that the
// NodeNetworkInterfaceDown alert must not report on. Each entry is a
// regular-expression fragment: clusterAlerts joins the entries with "|" and
// substitutes the result into the device!~ matcher of the alert expression,
// so the order of the entries determines the generated expression text.
var ignoredInterfacesForNetworkDown = []string{
	// Loopback interface.
	"lo",
	// Tunnel bridge.
	"tunbr",
	// Virtual ethernet devices.
	"veth.+",
	// OVS internal system interface.
	"ovs-system",
	// OVN Geneve overlay/encapsulation interfaces.
	"genev_sys.+",
	// OVN integration bridge.
	"br-int",
}
20
+
9
21
func clusterAlerts () []promv1.Rule {
10
22
return []promv1.Rule {
11
23
{
@@ -23,7 +35,7 @@ func clusterAlerts() []promv1.Rule {
23
35
},
24
36
{
25
37
Alert : "HAControlPlaneDown" ,
26
- Expr : intstr .FromString ("kube_node_role{role=\" control-plane\" } * on(node) kube_node_status_condition{condition=\" Ready\" ,status=\" true\" } == 0" ),
38
+ Expr : intstr .FromString ("kube_node_role{role=' control-plane' } * on(node) kube_node_status_condition{condition=' Ready' ,status=' true' } == 0" ),
27
39
For : ptr .To (promv1 .Duration ("5m" )),
28
40
Annotations : map [string ]string {
29
41
"summary" : "Control plane node {{ $labels.node }} is not ready" ,
@@ -36,7 +48,7 @@ func clusterAlerts() []promv1.Rule {
36
48
},
37
49
{
38
50
Alert : "NodeNetworkInterfaceDown" ,
39
- Expr : intstr .FromString ("count by (instance) (node_network_up{device!~\" veth.+|tunbr \" } == 0) > 0" ),
51
+ Expr : intstr .FromString (fmt . Sprintf ( "count by (instance) (node_network_up{device!~'%s' } == 0) > 0" , strings . Join ( ignoredInterfacesForNetworkDown , "|" )) ),
40
52
For : ptr .To (promv1 .Duration ("5m" )),
41
53
Annotations : map [string ]string {
42
54
"summary" : "Network interfaces are down" ,
0 commit comments