From 13b6680a345f4e5254738b3dbc6f9f087760f7ba Mon Sep 17 00:00:00 2001 From: Dario Mapelli Date: Wed, 12 Jun 2024 16:20:59 +0200 Subject: [PATCH] crabserver - lower threshold for pod restart with too many open fds --- .../prometheus/rules/crabserver.rules | 22 +++------------- .../prometheus/rules/crabserver.test | 25 +++---------------- 2 files changed, 7 insertions(+), 40 deletions(-) diff --git a/kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.rules b/kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.rules index 733313c05..498c240ca 100644 --- a/kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.rules +++ b/kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.rules @@ -1,10 +1,8 @@ groups: - name: crabserver rules: - - record: avg_open_fds_10m - expr: avg_over_time(crabserver_process_open_fds[10m]) - - record: avg_open_fds_30m - expr: avg_over_time(crabserver_process_open_fds[30m]) + - record: avg_open_fds_8m + expr: avg_over_time(crabserver_process_open_fds[8m]) - alert: CRAB server is down expr: crabserver_num_cpus == 0 for: 5m @@ -17,20 +15,8 @@ groups: annotations: summary: "crabserver {{ $labels.env }} is down" description: "{{ $labels.env }} has been down for more than 5m" - - alert: CRAB server service has large number of fds - expr: avg_open_fds_10m > 75 - for: 1m - labels: - severity: warning - tag: cmsweb - service: crab - host: "{{ $labels.host }}" - action: Please check CRAB server on {{ $labels.instance }} and possibly restart it - annotations: - summary: "CRAB {{ $labels.env }} environment" - description: "{{ $labels.env }} has large level of fds {{ $value }} (avg 10m) for more than 1m" - alert: CRAB server service has high number of fds - expr: avg_open_fds_30m > 100 + expr: avg_open_fds_8m > 50 for: 1m labels: severity: high @@ -40,4 +26,4 @@ groups: action: Please restart CRAB server on {{ $labels.instance }} annotations: summary: "CRAB {{ $labels.env }} environment" - description: "{{ $labels.env }} has high level of fds {{ $value }} (avg 30m) for more than 1m" + description: "{{ $labels.env }} has high level of fds {{ $value }} (avg 8m) for more than 1m" diff --git a/kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.test b/kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.test index 47898dd32..6c245ce8c 100644 --- a/kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.test +++ b/kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.test @@ -25,27 +25,8 @@ tests: description: "prod has been down for more than 5m" - interval: 1m input_series: - - series: 'avg_open_fds_10m{env="prod",instance="test-instance",host="k8s-test"}' - values: '76+1x100' - alert_rule_test: - - eval_time: 10m - alertname: CRAB server service has large number of fds - exp_alerts: - - exp_labels: - severity: warning - tag: cmsweb - service: crab - host: k8s-test - action: Please check CRAB server on test-instance and possibly restart it - instance: test-instance - env: prod - exp_annotations: - summary: "CRAB prod environment" - description: "prod has large level of fds 86 (avg 10m) for more than 1m" -- interval: 1m - input_series: - - series: 'avg_open_fds_30m{env="prod",instance="test-instance",host="k8s-test"}' - values: '101+1x100' + - series: 'avg_open_fds_8m{env="prod",instance="test-instance",host="k8s-test"}' + values: '51+1x100' alert_rule_test: - eval_time: 10m alertname: CRAB server service has high number of fds @@ -60,4 +41,4 @@ tests: env: prod exp_annotations: summary: "CRAB prod environment" - description: "prod has high level of fds 111 (avg 30m) for more than 1m" + description: "prod has high level of fds 61 (avg 8m) for more than 1m"