feat(base-cluster/monitoring)!: migrate promtail to alloy (#1347)
- update loki to an up-to-date version
- also replace otel collector with alloy, saving resources!
cwrau authored Mar 6, 2025
1 parent 84442a3 commit 24db445
Showing 15 changed files with 451 additions and 186 deletions.
2 changes: 2 additions & 0 deletions .github/trusted_registries.yaml
@@ -30,6 +30,8 @@ ghcr.io:
aquasecurity: ALL_IMAGES
kyverno: ALL_IMAGES
teutonet: ALL_IMAGES
jimmidyson:
configmap-reload: ALL_TAGS
quay.io:
cilium: ALL_IMAGES
jetstack: ALL_IMAGES
21 changes: 21 additions & 0 deletions charts/base-cluster/README.md.gotmpl
@@ -315,4 +315,25 @@ upgrade, they will be recreated in version 6.
This also makes kyverno HA, so be aware that kyverno will need more resources in
your cluster.

### 6.x.x -> 7.0.0

This release allows the user to use the predefined k8s ClusterRoles
(`admin`, `edit`, `view`, ...).

This might clash with existing custom roles named `admin`, `edit`, `view`, ...,
which therefore need to be adjusted.
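
For illustration only, a minimal sketch of granting one of the predefined
ClusterRoles inside a namespace (the namespace and group names here are
hypothetical):

```yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: team-view
  namespace: my-namespace # hypothetical namespace
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: view # one of the predefined k8s ClusterRoles
subjects:
  - apiGroup: rbac.authorization.k8s.io
    kind: Group
    name: my-team # hypothetical group
```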

### 7.x.x -> 8.0.0

This release migrates the now unsupported `loki-stack` to the normal `loki` Helm
chart.

This is a breaking change: apart from a new storage engine, the deployment also
moves from the `loki` namespace to `monitoring`, in line with every other
monitoring deployment, which in turn also deletes the `loki` namespace.

This release also replaces `promtail` and the `otel-collector` with `alloy`; using
<https://github.com/teutonet/teutonet-helm-charts/blob/main/charts/common/templates/_telemetry.tpl>
makes this a drop-in change.

{{ .Files.Get "values.md" }}
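
The charts consume this helper as shown in the diffs below; as a minimal sketch
(assuming a chart that includes the shared `common` library chart — the
`tracing:`/`endpoint:` keys are purely illustrative), the pattern looks like:

```yaml
{{- $telemetryConf := include "common.telemetry.conf" (dict "protocol" "otlp") | fromYaml }}
{{- if $telemetryConf.enabled }}
tracing:
  endpoint: {{ printf "%s:%d" $telemetryConf.host $telemetryConf.port }}
{{- end }}
```

The helper returns `enabled`, `host`, and `port`, so each chart can point at
`alloy` without hard-coding the old `open-telemetry-collector` service name.
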
7 changes: 4 additions & 3 deletions charts/base-cluster/templates/ingress/nginx.yaml
@@ -27,7 +27,8 @@ spec:
serviceMonitor:
enabled: {{ .Values.monitoring.prometheus.enabled }}
additionalLabels: {{- toYaml .Values.monitoring.labels | nindent 12 }}
{{- if .Values.monitoring.tracing.enabled }}
{{- $telemetryConf := include "common.telemetry.conf" (dict "protocol" "otlp") | fromYaml }}
{{- if and $telemetryConf.enabled .Values.monitoring.prometheus.enabled }}
opentelemetry:
enabled: true
{{- if and .Values.global.imageRegistry false }}
@@ -40,10 +41,10 @@ spec:
use-gzip: true
enable-brotli: true
enable-underscores-in-headers: true
{{- if .Values.monitoring.tracing.enabled }}
{{- if $telemetryConf.enabled }}
enable-opentelemetry: true
opentelemetry-operation-name: ingress
otlp-collector-host: open-telemetry-collector-opentelemetry-collector.monitoring
otlp-collector-host: {{ $telemetryConf.host }}
{{- end }}
service:
annotations:
14 changes: 8 additions & 6 deletions charts/base-cluster/templates/kyverno/kyverno.yaml
@@ -62,25 +62,27 @@ spec:
# this only works in version 3
admissionController:
replicas: 3
{{- if and .Values.monitoring.tracing.enabled .Values.monitoring.prometheus.enabled }}
{{- $telemetryConf := include "common.telemetry.conf" (dict "protocol" "jaeger" "serviceProtocol" "grpc") | fromYaml -}}
{{- $telemetryEnabled := and $telemetryConf.enabled .Values.monitoring.prometheus.enabled -}}
{{- if $telemetryEnabled }}
tracing: &tracingConfig
enabled: true
address: open-telemetry-collector-opentelemetry-collector.monitoring
port: 14250 # jaeger-grpc
address: {{ $telemetryConf.host }}
port: {{ $telemetryConf.port }}
{{- end }}
backgroundController:
replicas: 2
{{- if and .Values.monitoring.tracing.enabled .Values.monitoring.prometheus.enabled }}
{{- if $telemetryEnabled }}
tracing: *tracingConfig
{{- end }}
reportsController:
replicas: 2
{{- if and .Values.monitoring.tracing.enabled .Values.monitoring.prometheus.enabled }}
{{- if $telemetryEnabled }}
tracing: *tracingConfig
{{- end }}
cleanupController:
replicas: 2
{{- if and .Values.monitoring.tracing.enabled .Values.monitoring.prometheus.enabled }}
{{- if $telemetryEnabled }}
tracing: *tracingConfig
{{- end }}
podDisruptionBudget:
@@ -32,7 +32,7 @@ spec:
namespaces:
- kube-system
- default
{{ $lokiPromtail := dict "resources" (dict "namespaces" (list "loki") "kinds" (list "Pod") "names" (list "loki-promtail-*")) -}}
{{ $alloy := dict "resources" (dict "namespaces" (list "monitoring") "kinds" (list "Pod") "names" (list "alloy-*")) -}}
{{- $syncEtcdSecret := dict "resources" (dict "namespaces" (list "monitoring") "kinds" (list "Pod") "names" (list "sync-etcd-secret-*")) -}}
{{- $nodeExporter := dict "resources" (dict "namespaces" (list "monitoring") "kinds" (list "Pod") "names" (list "kube-prometheus-stack-prometheus-node-exporter-*")) -}}
{{- $nfsServerProvisioner := dict "resources" (dict "namespaces" (list "nfs-server-provisioner") "kinds" (list "Pod") "names" (list "nfs-server-provisioner-0")) -}}
@@ -46,8 +46,8 @@
{{- $disallowHostPorts := list -}}

{{- if .Values.monitoring.loki.enabled -}}
{{- $disallowHostPath = append $disallowHostPath $lokiPromtail -}}
{{- $runAsNonRoot = append $runAsNonRoot $lokiPromtail -}}
{{- $disallowHostPath = append $disallowHostPath $alloy -}}
{{- $runAsNonRoot = append $runAsNonRoot $alloy -}}
{{- end -}}
{{- if .Values.monitoring.prometheus.enabled -}}
{{- $disallowHostPath = append $disallowHostPath $syncEtcdSecret -}}
259 changes: 259 additions & 0 deletions charts/base-cluster/templates/monitoring/alloy.yaml
@@ -0,0 +1,259 @@
{{- if and .Values.monitoring.prometheus.enabled (or .Values.monitoring.tracing.enabled .Values.monitoring.loki.enabled) -}}
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: alloy
namespace: monitoring
labels: {{- include "common.labels.standard" $ | nindent 4 }}
app.kubernetes.io/component: alloy
app.kubernetes.io/part-of: monitoring
spec:
chart:
spec: {{- include "base-cluster.helm.chartSpec" (dict "repo" "grafana" "chart" "alloy" "context" $) | nindent 6 }}
interval: 1h
driftDetection:
mode: enabled
install:
timeout: 10m0s
crds: Skip
upgrade:
timeout: 10m0s
crds: Skip
dependsOn:
- name: kube-prometheus-stack
namespace: monitoring
values:
{{- if .Values.global.imageRegistry }}
global:
image:
registry: {{ $.Values.global.imageRegistry }}
{{- end }}
alloy:
enableReporting: false
resources: {{- include "common.resources" .Values.monitoring.loki.promtail | nindent 8 }}
{{- if .Values.monitoring.loki.enabled }}
mounts:
varlog: true
{{- end }}
securityContext:
seccompProfile:
type: RuntimeDefault
configMap:
content: |
{{- if .Values.monitoring.loki.enabled }}
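// Log pipeline: discover pods via the Kubernetes API, relabel them with
// app/namespace/pod metadata and a __path__ pointing at their log files
// under /var/log/pods, tail those files, parse the CRI log format, and
// push the result to Loki.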
discovery.kubernetes "pods" {
role = "pod"
}
discovery.relabel "pods" {
targets = discovery.kubernetes.pods.targets
rule {
source_labels = ["__meta_kubernetes_pod_controller_name"]
regex = "([0-9a-z-.]+?)(-[0-9a-f]{8,10})?"
target_label = "__tmp_controller_name"
}
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name", "__meta_kubernetes_pod_label_app", "__tmp_controller_name", "__meta_kubernetes_pod_name"]
regex = "^;*([^;]+)(;.*)?$"
target_label = "app"
}
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_instance", "__meta_kubernetes_pod_label_instance"]
regex = "^;*([^;]+)(;.*)?$"
target_label = "instance"
}
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_component", "__meta_kubernetes_pod_label_component"]
regex = "^;*([^;]+)(;.*)?$"
target_label = "component"
}
rule {
source_labels = ["__meta_kubernetes_pod_node_name"]
target_label = "node_name"
}
rule {
source_labels = ["__meta_kubernetes_namespace"]
target_label = "namespace"
}
rule {
source_labels = ["namespace", "app"]
separator = "/"
target_label = "job"
}
rule {
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
target_label = "container"
}
rule {
source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
separator = "/"
target_label = "__path__"
replacement = "/var/log/pods/*$1/*.log"
}
rule {
source_labels = ["__meta_kubernetes_pod_annotationpresent_kubernetes_io_config_hash", "__meta_kubernetes_pod_annotation_kubernetes_io_config_hash", "__meta_kubernetes_pod_container_name"]
separator = "/"
regex = "true/(.*)"
target_label = "__path__"
replacement = "/var/log/pods/*$1/*.log"
}
}
local.file_match "pods" {
path_targets = discovery.relabel.pods.output
}
loki.source.file "pods" {
targets = local.file_match.pods.targets
forward_to = [loki.process.pods.receiver]
}
loki.process "pods" {
forward_to = [loki.write.default.receiver]
stage.cri { }
}
loki.write "default" {
endpoint {
url = "http://loki:3100/loki/api/v1/push"
}
external_labels = {}
}
{{- end }}
{{- if .Values.monitoring.tracing.enabled }}
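// Trace pipeline: accept traces over OTLP, Jaeger, and Zipkin, enrich them
// with Kubernetes metadata, batch them, and export them to Tempo via OTLP.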
otelcol.receiver.otlp "default" {
grpc { }
http { }
output {
traces = [otelcol.processor.k8sattributes.default.input]
}
}
otelcol.receiver.jaeger "default" {
protocols {
grpc { }
thrift_http { }
thrift_compact {
max_packet_size = "63KiB488B"
}
}
output {
traces = [otelcol.processor.k8sattributes.default.input]
}
}
otelcol.receiver.zipkin "default" {
output {
traces = [otelcol.processor.k8sattributes.default.input]
}
}
otelcol.processor.k8sattributes "default" {
auth_type = "serviceAccount"
extract {
metadata = ["k8s.namespace.name", "k8s.deployment.name", "k8s.statefulset.name", "k8s.daemonset.name", "k8s.cronjob.name", "k8s.job.name", "k8s.node.name", "k8s.pod.name", "k8s.pod.uid", "k8s.pod.start_time"]
}
pod_association {
source {
from = "resource_attribute"
name = "k8s.pod.ip"
}
}
pod_association {
source {
from = "resource_attribute"
name = "k8s.pod.uid"
}
}
pod_association {
source {
from = "connection"
}
}
output {
traces = [otelcol.processor.batch.default.input]
}
}
otelcol.processor.batch "default" {
output {
traces = [otelcol.exporter.otlp.tempo.input]
}
}
otelcol.exporter.otlp "tempo" {
client {
endpoint = "grafana-tempo-distributor:4317"
tls {
insecure = true
}
}
}
{{- end }}
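# Keep the well-known receiver ports (Jaeger, OTLP, Zipkin) and a metrics
# port reachable via the Service, so existing senders keep working.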
extraPorts:
- name: jaeger-compact
port: 6831
protocol: UDP
targetPort: 6831
- name: jaeger-grpc
port: 14250
protocol: TCP
targetPort: 14250
- name: jaeger-thrift
port: 14268
protocol: TCP
targetPort: 14268
- name: metrics
port: 8888
protocol: TCP
targetPort: 8888
- name: otlp
port: 4317
appProtocol: grpc
protocol: TCP
targetPort: 4317
- name: otlp-http
port: 4318
protocol: TCP
targetPort: 4318
- name: zipkin
port: 9411
appProtocol: http/protobuf
protocol: TCP
targetPort: 9411
crds:
create: false
controller:
priorityClassName: monitoring-components
serviceMonitor:
enabled: true
additionalLabels: {{- toYaml .Values.monitoring.labels | nindent 10 }}
{{- end -}}
@@ -65,11 +65,12 @@ prometheusSpec:
- __address__
target_label: cluster
replacement: {{ .Values.global.clusterName }}
{{- if .Values.monitoring.tracing.enabled }}
{{- $telemetryConf := include "common.telemetry.conf" (dict "protocol" "otlp") | fromYaml }}
{{- if $telemetryConf.enabled }}
tracingConfig:
clientType: grpc
samplingFraction: "0.1"
insecure: true
endpoint: open-telemetry-collector-opentelemetry-collector.monitoring:4317
endpoint: {{ printf "%s:%d" $telemetryConf.host $telemetryConf.port }}
{{- end }}
{{- end -}}