diff --git a/operator/assets/alertmanager/alertmanager.yaml b/operator/assets/alertmanager/alertmanager.yaml index 53bc671..b11fdbd 100644 --- a/operator/assets/alertmanager/alertmanager.yaml +++ b/operator/assets/alertmanager/alertmanager.yaml @@ -33,7 +33,7 @@ receivers: {{ range .Alerts }} *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` *Description:* {{ .Annotations.description }} - *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:> + *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:>{{ if .Annotations.runbook }} *Runbook:* <{{ .Annotations.runbook }}|:spiral_note_pad:>{{ end }} *Details:* {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` {{ end }} diff --git a/operator/assets/prometheus/rules/custom.rules b/operator/assets/prometheus/rules/custom.rules index 01e05fe..e94551e 100644 --- a/operator/assets/prometheus/rules/custom.rules +++ b/operator/assets/prometheus/rules/custom.rules @@ -7,21 +7,31 @@ groups: labels: severity: critical annotations: - summary: 'Deployment of {{$labels.exported_namespace}}/{{$labels.name}} failed' - description: 'Deployment of {{$labels.exported_namespace}}/{{$labels.name}} failed - observed generation != intended generation' + summary: Deployment of {{$labels.namespace}}/{{$labels.name}} failed + description: Deployment of {{$labels.namespace}}/{{$labels.name}} failed - observed generation != intended generation - alert: DeploymentReplicasMismatch expr: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_availableReplicas{job="kube-state-metrics"} for: 5m labels: severity: critical annotations: - summary: 'Deployment of {{$labels.exported_namespace}}/{{$labels.name}} failed.' - description: 'Deployment of {{$labels.exported_namespace}}/{{$labels.name}} failed - observed replicas != intended replicas.' + summary: Deployment of {{$labels.namespace}}/{{$labels.name}} failed. + description: Deployment of {{$labels.namespace}}/{{$labels.name}} failed - observed replicas != intended replicas. 
+ - alert: DeploymentReplicasUnavailable + expr: kube_deployment_status_replicas_unavailable > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Deployment `{{ $labels.deployment }}` has `{{ $value }}` replica(s) unavailable. + description: Deployment `{{ $labels.deployment }}` has `{{ $value }}` replica(s) unavailable. + runbook: https://kubernetes.io/docs/tasks/debug-application-cluster/debug-pod-replication-controller/ - alert: PodRestartingTooMuch - expr: rate(kube_pod_container_status_restarts[10m])*1200 > 2 + expr: rate(kube_pod_container_status_restarts[5m])*600 > 2 for: 5m labels: severity: warning annotations: - summary: 'Pod {{$labels.namespace}}/{{$label.name}} restarting too much.' - description: 'Pod {{$labels.namespace}}/{{$label.name}} restarting too much.' + summary: Pod `{{ $labels.pod }}` is restarting too much. + description: Pod `{{ $labels.namespace }}/{{ $labels.pod }}` is restarting too much. + runbook: https://kubernetes.io/docs/tasks/debug-application-cluster/debug-pod-replication-controller/ diff --git a/operator/deploy b/operator/deploy index afc58f7..4a21d21 100755 --- a/operator/deploy +++ b/operator/deploy @@ -3,10 +3,10 @@ ######################################################################################### #components default version ######################################################################################### -GRAFANA_DEFAULT_VERSION=4.5.1 +GRAFANA_DEFAULT_VERSION=4.5.2 PROMETHEUS_DEFAULT_VERSION=v2.0.0-beta.5 PROMETHEUS_OPERATOR_DEFAULT_VERSION=v0.13.0 -ALERT_MANAGER_DEFAULT_VERSION=v0.8.0 +ALERT_MANAGER_DEFAULT_VERSION=v0.9.0 NODE_EXPORTER_DEFAULT_VERSION=v0.14.0 KUBE_STATE_METRICS_DEFAULT_VERSION=v1.0.1 diff --git a/operator/manifests/prometheus/prometheus-k8s-rules.yaml b/operator/manifests/prometheus/prometheus-k8s-rules.yaml index 79a2b53..a37e8c7 100644 --- a/operator/manifests/prometheus/prometheus-k8s-rules.yaml +++ b/operator/manifests/prometheus/prometheus-k8s-rules.yaml @@ -50,24 
+50,34 @@ data: labels: severity: critical annotations: - summary: 'Deployment of {{$labels.exported_namespace}}/{{$labels.name}} failed' - description: 'Deployment of {{$labels.exported_namespace}}/{{$labels.name}} failed - observed generation != intended generation' + summary: Deployment of {{$labels.namespace}}/{{$labels.name}} failed + description: Deployment of {{$labels.namespace}}/{{$labels.name}} failed - observed generation != intended generation - alert: DeploymentReplicasMismatch expr: kube_deployment_spec_replicas{job="kube-state-metrics"} != kube_deployment_status_availableReplicas{job="kube-state-metrics"} for: 5m labels: severity: critical annotations: - summary: 'Deployment of {{$labels.exported_namespace}}/{{$labels.name}} failed.' - description: 'Deployment of {{$labels.exported_namespace}}/{{$labels.name}} failed - observed replicas != intended replicas.' + summary: Deployment of {{$labels.namespace}}/{{$labels.name}} failed. + description: Deployment of {{$labels.namespace}}/{{$labels.name}} failed - observed replicas != intended replicas. + - alert: DeploymentReplicasUnavailable + expr: kube_deployment_status_replicas_unavailable > 0 + for: 5m + labels: + severity: critical + annotations: + summary: Deployment `{{ $labels.deployment }}` has `{{ $value }}` replica(s) unavailable. + description: Deployment `{{ $labels.deployment }}` has `{{ $value }}` replica(s) unavailable. + runbook: https://kubernetes.io/docs/tasks/debug-application-cluster/debug-pod-replication-controller/ - alert: PodRestartingTooMuch - expr: rate(kube_pod_container_status_restarts[10m])*1200 > 2 + expr: rate(kube_pod_container_status_restarts[5m])*600 > 2 for: 5m labels: severity: warning annotations: - summary: 'Pod {{$labels.namespace}}/{{$label.name}} restarting too much.' - description: 'Pod {{$labels.namespace}}/{{$label.name}} restarting too much.' + summary: Pod `{{ $labels.pod }}` is restarting too much. 
+ description: Pod `{{ $labels.namespace }}/{{ $labels.pod }}` is restarting too much. + runbook: https://kubernetes.io/docs/tasks/debug-application-cluster/debug-pod-replication-controller/ etcd3.rules: |+ groups: - name: etcd.rules