forked from openfaas/faas-netes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prometheus-cfg.yaml
204 lines (186 loc) · 6.75 KB
/
prometheus-cfg.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
{{- /*
ConfigMap "prometheus-config" for the Prometheus instance shipped with this chart.

Rendered only when .Values.prometheus.create is set. It carries up to three files:
  prometheus.yml        - global settings, alerting target and scrape jobs
  alert.rules.yml       - alerting rules
  prometheus-rules.yml  - recording rules (only when .Values.openfaasPro is set)

$functionNs is .Values.functionNamespace, falling back to the release namespace.

Template directives are kept at column 0 and use a left trim marker so that they
leave no blank lines or stray indentation inside the literal block scalars.
*/}}
{{- $functionNs := default .Release.Namespace .Values.functionNamespace }}
{{- if .Values.prometheus.create }}
---
kind: ConfigMap
apiVersion: v1
metadata:
  labels:
    app: {{ template "openfaas.name" . }}
    chart: {{ .Chart.Name }}-{{ .Chart.Version }}
    component: prometheus-config
    heritage: {{ .Release.Service }}
    # Quoted so that an all-digit release name still renders as a string.
    release: {{ .Release.Name | quote }}
  name: prometheus-config
  namespace: {{ .Release.Namespace | quote }}
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
      external_labels:
        monitor: 'faas-monitor'

    rule_files:
      - 'alert.rules.yml'
{{- if .Values.openfaasPro }}
      - 'prometheus-rules.yml'
{{- end }}

    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093

    scrape_configs:
      # Prometheus scraping itself.
      - job_name: 'prometheus'
        scrape_interval: 10s
        static_configs:
          - targets: ['localhost:9090']

      # Capture endpoints in the openfaas namespace with a scrape annotation
      # such as the gateway-provider service.
      - job_name: 'openfaas-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - {{ .Release.Namespace | quote }}
        relabel_configs:
          # Copy every Kubernetes service label onto the target.
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name
          # Rewrite the scrape address to use the port given in the
          # prometheus.io/port service annotation.
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            separator: ;
            regex: ([^:]+)(?::\d+)?;(\d+)
            target_label: __address__
            replacement: $1:$2
            action: replace
          # Only keep services annotated with prometheus.io/scrape: "true".
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          # - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_port]
          #   action: keep
          #   regex: true

      # Pods in the release namespace (and the function namespace, when it
      # differs) that carry a scrape annotation.
      - job_name: 'kubernetes-pods'
        scrape_interval: 5s
        honor_labels: false
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - {{ .Release.Namespace | quote }}
{{- if ne $functionNs (.Release.Namespace | toString) }}
                - {{ $functionNs | quote }}
{{- end }}
        relabel_configs:
          # Copy every Kubernetes pod label onto the target.
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          # Only keep pods annotated with prometheus.io/scrape: "true".
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          # Rewrite the scrape address to use the port given in the
          # prometheus.io/port pod annotation.
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          # Use the prometheus.io/path pod annotation, when present, as the
          # metrics path.
          - action: replace
            regex: (.+)
            source_labels:
              - __meta_kubernetes_pod_annotation_prometheus_io_path
            target_label: __metrics_path__
{{- if .Values.openfaasPro }}

      # Per-node resource metrics, fetched through the API server proxy
      # (/api/v1/nodes/<node>/proxy/metrics/resource) using the pod's
      # service account token.
      - job_name: 'kubernetes-resource-metrics'
        scrape_interval: 10s
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # NOTE(review): with insecure_skip_verify enabled the server
          # certificate is not verified, so ca_file above has no effect.
          # Confirm whether skipping verification is still required.
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # Send every scrape to the API server instead of the node itself.
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/resource
        metric_relabel_configs:
          # Keep only pod_cpu_* and pod_memory_* series.
          - source_labels: [__name__]
            regex: (pod)_(cpu|memory)_(.+)
            action: keep
          # Exclude container metrics
          - source_labels: [__name__]
            regex: container_(.+)
            action: drop
          # Mirror the "namespace" label as "kubernetes_namespace".
          - action: replace
            source_labels:
              - namespace
            regex: '(.*)'
            replacement: '$1'
            target_label: kubernetes_namespace
          # Output deployment name from Pod: the pod name with its last two
          # "-"-separated segments removed.
          - action: replace
            source_labels:
              - pod
            regex: '^([0-9a-zA-Z-]+)+(-[0-9a-zA-Z]+-[0-9a-zA-Z]+)$'
            replacement: '$1'
            target_label: deployment_name
          # Output fully-qualified function name fn.ns
          - source_labels: [deployment_name, kubernetes_namespace]
            separator: ";"
            regex: '(.*);(.*)'
            replacement: '${1}.${2}'
            target_label: "function_name"
{{- end }}
  alert.rules.yml: |
    groups:
      - name: openfaas
        rules:
          - alert: service_down
            expr: up == 0
{{- if or (eq .Values.openfaasPro false) (eq (or .Values.ceScaling false) true) }}
          # Fires when a function's rate of HTTP 200 invocations over the
          # last 10s exceeds 5 per second for at least 5s.
          - alert: APIHighInvocationRate
            expr: sum(rate(gateway_function_invocation_total{code="200"}[10s])) BY (function_name) > 5
            for: 5s
            labels:
              service: gateway
              severity: major
            annotations:
              description: High invocation total on "{{ "{{" }}$labels.function_name{{ "}}" }}"
              summary: High invocation total on "{{ "{{" }}$labels.function_name{{ "}}" }}"
{{- end }}
{{- if .Values.openfaasPro }}
  prometheus-rules.yml: |
    groups:
      # One job:function_current_load:sum series per function, labelled with
      # the scaling_type it was computed for.
      - name: load
        rules:
          # rps: per-function invocation rate over the last 30s.
          # NOTE(review): this rule filters with "> 1" and a plain "and",
          # while the capacity rule uses "> bool 1" with "and on
          # (function_name)"; confirm the difference is intended.
          - record: job:function_current_load:sum
            expr: sum by (function_name) ( rate( gateway_function_invocation_total{}[30s] ) ) and avg by (function_name) (gateway_service_target_load{scaling_type="rps"}) > 1
            labels:
              scaling_type: rps
          # capacity: peak in-flight invocations per function over the last
          # 45s, sampled every 5s.
          - record: job:function_current_load:sum
            expr: sum by (function_name) ( max_over_time( gateway_function_invocation_inflight[45s:5s])) and on (function_name) avg by(function_name) (gateway_service_target_load{scaling_type="capacity"}) > bool 1
            labels:
              scaling_type: capacity
          # cpu: per-function CPU usage rate scaled by 1000, multiplied by
          # 0 or 1 depending on whether the cpu target load is above 1.
          - record: job:function_current_load:sum
            expr: sum(irate ( pod_cpu_usage_seconds_total{}[1m])*1000) by (function_name) * on (function_name) avg by (function_name) (gateway_service_target_load{scaling_type="cpu"} > bool 1 )
            labels:
              scaling_type: cpu
      # Functions that had a non-zero invocation-start rate at some point in
      # the last minute.
      - name: recently_started_1m
        interval: 10s
        rules:
          - record: job:function_current_started:max_sum
            expr: max_over_time(sum by (function_name) (rate( gateway_function_invocation_started{}[1m]))[1m:5s]) > 0
{{- end }}
{{- end }}