diff --git a/charts/pyrra/Chart.yaml b/charts/pyrra/Chart.yaml index db6b95124..542c2bbd8 100644 --- a/charts/pyrra/Chart.yaml +++ b/charts/pyrra/Chart.yaml @@ -16,7 +16,7 @@ type: application # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.13.4 +version: 0.14.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/pyrra/README.md b/charts/pyrra/README.md index 3e6387c13..dc72cf41e 100644 --- a/charts/pyrra/README.md +++ b/charts/pyrra/README.md @@ -1,6 +1,6 @@ # pyrra -![Version: 0.13.4](https://img.shields.io/badge/Version-0.13.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.7.7](https://img.shields.io/badge/AppVersion-v0.7.7-informational?style=flat-square) +![Version: 0.14.0](https://img.shields.io/badge/Version-0.14.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.7.7](https://img.shields.io/badge/AppVersion-v0.7.7-informational?style=flat-square) SLO manager and alert generator @@ -44,21 +44,36 @@ The dashboards can be deployed using a ConfigMap and get's automatically [reload | ingress.tls | list | `[]` | | | nameOverride | string | `""` | overrides chart name | | nodeSelector | object | `{}` | node selector for scheduling server pod | +| operatorMetricsAddress | string | `":8080"` | Address to expose operator metrics | | podAnnotations | object | `{}` | additional annotations for server pod | | podSecurityContext | object | `{}` | additional security context for server pod | | prometheusExternalUrl | string | `""` | url to public-facing prometheus UI in case it differs from prometheusUrl | +| 
prometheusRule.enabled | bool | `false` | enables creation of PrometheusRules to monitor Pyrra | +| prometheusRule.labels | object | `{}` | Set labels that will be applied on all PrometheusRules (alerts) | +| prometheusRule.pyrraReconciliationError.severity | string | `"warning"` | Set severity for PyrraReconciliationError alert | | prometheusUrl | string | `"http://prometheus-operated.monitoring.svc.cluster.local:9090"` | url to prometheus instance with metrics | | resources | object | `{}` | resource limits and requests for server pod | | securityContext | object | `{}` | additional security context for server | | service.annotations | object | `{}` | Annotations to add to the service | | service.nodePort | string | `""` | node port for HTTP, choose port between <30000-32767> | +| service.operatorMetricsPort | int | `8080` | service port for operator metrics | | service.port | int | `9099` | service port for server | | service.type | string | `"ClusterIP"` | service type for server | | serviceAccount.annotations | object | `{}` | Annotations to add to the service account | | serviceAccount.create | bool | `true` | Specifies whether a service account should be created | | serviceAccount.name | string | `""` | If not set and create is true, a name is generated using the fullname template | | serviceMonitor.enabled | bool | `false` | enables servicemonitor for server monitoring | +| serviceMonitor.interval | string | `""` | Set interval for scraping metrics | +| serviceMonitor.jobLabel | string | `""` | provides the possibility to override the jobName if needed | | serviceMonitor.labels | object | `{}` | Set labels for the ServiceMonitor, use this to define your scrape label for Prometheus Operator | +| serviceMonitor.metricRelabelings | list | `[]` | Set metric relabelings for the ServiceMonitor | +| serviceMonitor.relabelings | list | `[]` | Set relabelings for the ServiceMonitor | +| serviceMonitorOperator.enabled | bool | `false` | enables servicemonitor for 
operator monitoring | +| serviceMonitorOperator.interval | string | `""` | Set interval for scraping metrics | +| serviceMonitorOperator.jobLabel | string | `""` | provides the possibility to override the jobName if needed | +| serviceMonitorOperator.labels | object | `{}` | Set labels for the ServiceMonitor, use this to define your scrape label for Prometheus Operator | +| serviceMonitorOperator.metricRelabelings | list | `[]` | Set metric relabelings for the ServiceMonitor | +| serviceMonitorOperator.relabelings | list | `[]` | Set relabelings for the ServiceMonitor | | tolerations | object | `{}` | tolerations for scheduling server pod | | validatingWebhookConfiguration.enabled | bool | `false` | enables admission webhook for server to validate SLOs, this requires cert-manager to be installed | diff --git a/charts/pyrra/templates/_helpers.tpl b/charts/pyrra/templates/_helpers.tpl index 607bc8b11..a9bdacbae 100644 --- a/charts/pyrra/templates/_helpers.tpl +++ b/charts/pyrra/templates/_helpers.tpl @@ -64,3 +64,10 @@ Create the name of the service account to use {{- default "default" .Values.serviceAccount.name }} {{- end }} {{- end }} + +{{/* +Operator metrics port: the trailing digits of operatorMetricsAddress, so ":8080", "0.0.0.0:8080" and "[::]:8080" all yield 8080. An empty address falls back to ":8080" (the chart default; presumably also the operator's built-in default when --metrics-addr is omitted - confirm). +*/}} +{{- define "pyrra.operatorMetricsPort" -}} +{{ .Values.operatorMetricsAddress | default ":8080" | regexFind "[0-9]+$" }} +{{- end }} diff --git a/charts/pyrra/templates/deployment.yaml b/charts/pyrra/templates/deployment.yaml index 5b93db724..77209d85d 100644 --- a/charts/pyrra/templates/deployment.yaml +++ b/charts/pyrra/templates/deployment.yaml @@ -38,6 +38,9 @@ spec: {{- if and .Values.validatingWebhookConfiguration.enabled ($.Capabilities.APIVersions.Has "cert-manager.io/v1") }} - --disable-webhooks=false {{- end }} + {{- if .Values.operatorMetricsAddress }} + - --metrics-addr={{ .Values.operatorMetricsAddress }} + {{- end }} {{- with .Values.extraKubernetesArgs }} {{- toYaml . 
| nindent 12 }} {{- end }} @@ -48,6 +51,9 @@ spec: - mountPath: /tmp/k8s-webhook-server/serving-certs name: certs {{- end }} + ports: + - name: op-metrics + containerPort: {{ include "pyrra.operatorMetricsPort" . }} - name: {{ .Chart.Name }} securityContext: {{- toYaml .Values.securityContext | nindent 12 }} diff --git a/charts/pyrra/templates/prometheusrule.yaml b/charts/pyrra/templates/prometheusrule.yaml new file mode 100644 index 000000000..0599a5f0d --- /dev/null +++ b/charts/pyrra/templates/prometheusrule.yaml @@ -0,0 +1,26 @@ +{{- if .Values.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "pyrra.fullname" . }}-prometheusrule + labels: + {{- include "pyrra.labels" . | nindent 4 }} +spec: + groups: + - name: {{ include "pyrra.fullname" . }}-prometheusrules + rules: + - alert: PyrraReconciliationError + # The rate window is 20m because the controller only reconciles roughly once every 15 minutes: a shorter window + # would let the rate drop back to 0 between reconciles and make the alert flap. + # The window is still short enough for the alert to resolve within a reasonable time after a broken SLO has been fixed/removed. + expr: sum by (job) (rate(controller_runtime_reconcile_errors_total{controller="servicelevelobjective"}[20m])) > 0 + for: 1m + labels: + severity: {{ .Values.prometheusRule.pyrraReconciliationError.severity | quote }} + {{- with .Values.prometheusRule.labels }} + {{- toYaml . | nindent 12 }} + {{- end }} + annotations: + summary: Failed to reconcile state + description: 'Pyrra Kubernetes operator failed to reconcile. Check logs for invalid ServiceLevelObjectives.' 
+{{- end }} diff --git a/charts/pyrra/templates/service.yaml b/charts/pyrra/templates/service.yaml index 1b7c6ced6..bea3b923c 100644 --- a/charts/pyrra/templates/service.yaml +++ b/charts/pyrra/templates/service.yaml @@ -20,5 +20,8 @@ spec: {{- if .Values.service.nodePort }} nodePort: {{ .Values.service.nodePort }} {{- end }} + - name: op-metrics + port: {{ .Values.service.operatorMetricsPort }} + targetPort: {{ include "pyrra.operatorMetricsPort" . }} selector: {{- include "pyrra.selectorLabels" . | nindent 4 }} diff --git a/charts/pyrra/templates/servicemonitor-operator.yaml b/charts/pyrra/templates/servicemonitor-operator.yaml new file mode 100644 index 000000000..96f3f127c --- /dev/null +++ b/charts/pyrra/templates/servicemonitor-operator.yaml @@ -0,0 +1,32 @@ +{{- if .Values.serviceMonitorOperator.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "pyrra.fullname" . }}-operator + labels: + {{- include "pyrra.labels" . | nindent 4 }} + {{- with .Values.serviceMonitorOperator.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + jobLabel: {{ .Values.serviceMonitorOperator.jobLabel | default (printf "%s-operator" (include "pyrra.fullname" .)) }} + selector: + matchLabels: + {{- include "pyrra.selectorLabels" . 
| nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + endpoints: + - port: op-metrics + {{- with .Values.serviceMonitorOperator.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.serviceMonitorOperator.metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.serviceMonitorOperator.relabelings }} + relabelings: + {{- toYaml . | nindent 6 }} + {{- end }} +{{- end }} diff --git a/charts/pyrra/templates/servicemonitor.yaml b/charts/pyrra/templates/servicemonitor-server.yaml similarity index 87% rename from charts/pyrra/templates/servicemonitor.yaml rename to charts/pyrra/templates/servicemonitor-server.yaml index 7959aa11c..63c6ef80f 100644 --- a/charts/pyrra/templates/servicemonitor.yaml +++ b/charts/pyrra/templates/servicemonitor-server.yaml @@ -23,10 +23,10 @@ spec: {{- end }} {{- if .Values.serviceMonitor.metricRelabelings }} metricRelabelings: - {{ toYaml .Values.serviceMonitor.metricRelabelings | indent 4 }} + {{- toYaml .Values.serviceMonitor.metricRelabelings | nindent 6 }} {{- end }} {{- if .Values.serviceMonitor.relabelings }} relabelings: - {{ toYaml .Values.serviceMonitor.relabelings | indent 4 }} + {{- toYaml .Values.serviceMonitor.relabelings | nindent 6 }} {{- end }} {{- end }} diff --git a/charts/pyrra/values.yaml b/charts/pyrra/values.yaml index d23763fe2..a940e8cbc 100644 --- a/charts/pyrra/values.yaml +++ b/charts/pyrra/values.yaml @@ -20,6 +20,8 @@ additionalLabels: {} extraApiArgs: [] # -- Extra args for Pyrra's Kubernetes container extraKubernetesArgs: [] +# -- Address to expose operator metrics +operatorMetricsAddress: ":8080" serviceAccount: # -- Specifies whether a service account should be created @@ -52,6 +54,8 @@ service: # -- service nodePort to expose # -- node port for HTTP, choose port between <30000-32767> nodePort: "" + # -- service 
port for operator metrics + operatorMetricsPort: 8080 ingress: # -- enables ingress for server UI @@ -74,7 +78,7 @@ ingress: # - chart-example.local # -- resource limits and requests for server pod -resources: {} +resources: {} # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little # resources, such as Minikube. If you do want to specify resources, uncomment the following @@ -97,7 +101,36 @@ serviceMonitor: # -- Set labels for the ServiceMonitor, use this to define your scrape label for Prometheus Operator labels: {} # -- provides the possibility to override the jobName if needed - # jobLabel: fancy-pyrra-server + jobLabel: "" + # -- Set interval for scraping metrics + interval: "" + # -- Set metric relabelings for the ServiceMonitor + metricRelabelings: [] + # -- Set relabelings for the ServiceMonitor + relabelings: [] + +serviceMonitorOperator: + # -- enables servicemonitor for operator monitoring + enabled: false + # -- Set labels for the ServiceMonitor, use this to define your scrape label for Prometheus Operator + labels: {} + # -- provides the possibility to override the jobName if needed + jobLabel: "" + # -- Set interval for scraping metrics + interval: "" + # -- Set metric relabelings for the ServiceMonitor + metricRelabelings: [] + # -- Set relabelings for the ServiceMonitor + relabelings: [] + +prometheusRule: + # -- enables creation of PrometheusRules to monitor Pyrra + enabled: false + # -- Set labels that will be applied on all PrometheusRules (alerts) + labels: {} + pyrraReconciliationError: + # -- Set severity for PyrraReconciliationError alert + severity: warning genericRules: # -- enables generate Pyrra generic recording rules. Pyrra generates metrics with the same name for each SLO.