Skip to content

Commit

Permalink
crabserver - lower threshold for pod restart with too many open fds
Browse files: browse the repository at this point in the history
  • Loading branch information
mapellidario committed Jun 12, 2024
1 parent 93e44b4 commit 13b6680
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 40 deletions.
22 changes: 4 additions & 18 deletions kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.rules
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,8 @@
groups:
- name: crabserver
rules:
- record: avg_open_fds_10m
expr: avg_over_time(crabserver_process_open_fds[10m])
- record: avg_open_fds_30m
expr: avg_over_time(crabserver_process_open_fds[30m])
- record: avg_open_fds_8m
expr: avg_over_time(crabserver_process_open_fds[8m])
- alert: CRAB server is down
expr: crabserver_num_cpus == 0
for: 5m
Expand All @@ -17,20 +15,8 @@ groups:
annotations:
summary: "crabserver {{ $labels.env }} is down"
description: "{{ $labels.env }} has been down for more than 5m"
- alert: CRAB server service has large number of fds
expr: avg_open_fds_10m > 75
for: 1m
labels:
severity: warning
tag: cmsweb
service: crab
host: "{{ $labels.host }}"
action: Please check CRAB server on {{ $labels.instance }} and possibly restart it
annotations:
summary: "CRAB {{ $labels.env }} environment"
description: "{{ $labels.env }} has large level of fds {{ $value }} (avg 10m) for more than 1m"
- alert: CRAB server service has high number of fds
expr: avg_open_fds_30m > 100
expr: avg_open_fds_8m > 50
for: 1m
labels:
severity: high
Expand All @@ -40,4 +26,4 @@ groups:
action: Please restart CRAB server on {{ $labels.instance }}
annotations:
summary: "CRAB {{ $labels.env }} environment"
description: "{{ $labels.env }} has high level of fds {{ $value }} (avg 30m) for more than 1m"
description: "{{ $labels.env }} has high level of fds {{ $value }} (avg 8m) for more than 1m"
25 changes: 3 additions & 22 deletions kubernetes/cmsweb/monitoring/prometheus/rules/crabserver.test
Original file line number | Diff line number | Diff line change
Expand Up @@ -25,27 +25,8 @@ tests:
description: "prod has been down for more than 5m"
- interval: 1m
input_series:
- series: 'avg_open_fds_10m{env="prod",instance="test-instance",host="k8s-test"}'
values: '76+1x100'
alert_rule_test:
- eval_time: 10m
alertname: CRAB server service has large number of fds
exp_alerts:
- exp_labels:
severity: warning
tag: cmsweb
service: crab
host: k8s-test
action: Please check CRAB server on test-instance and possibly restart it
instance: test-instance
env: prod
exp_annotations:
summary: "CRAB prod environment"
description: "prod has large level of fds 86 (avg 10m) for more than 1m"
- interval: 1m
input_series:
- series: 'avg_open_fds_30m{env="prod",instance="test-instance",host="k8s-test"}'
values: '101+1x100'
- series: 'avg_open_fds_8m{env="prod",instance="test-instance",host="k8s-test"}'
values: '51+1x100'
alert_rule_test:
- eval_time: 10m
alertname: CRAB server service has high number of fds
Expand All @@ -60,4 +41,4 @@ tests:
env: prod
exp_annotations:
summary: "CRAB prod environment"
description: "prod has high level of fds 111 (avg 30m) for more than 1m"
description: "prod has high level of fds 61 (avg 8m) for more than 1m"

0 comments on commit 13b6680

Please sign in to comment.