From d42a5a5831ca7dc40c8a4084d70f0e7e2fb8d5fd Mon Sep 17 00:00:00 2001 From: Povilas Versockas Date: Fri, 2 Feb 2024 15:16:23 +0200 Subject: [PATCH] feat: add tail sampling --- otel-integration/CHANGELOG.md | 4 + otel-integration/k8s-helm/Chart.yaml | 13 +- otel-integration/k8s-helm/README.md | 13 ++ .../k8s-helm/tail-sampling-values.yaml | 63 ++++++++++ otel-integration/k8s-helm/values.yaml | 116 ++++++++++++++++++ 5 files changed, 205 insertions(+), 4 deletions(-) create mode 100644 otel-integration/k8s-helm/tail-sampling-values.yaml diff --git a/otel-integration/CHANGELOG.md b/otel-integration/CHANGELOG.md index ccbaadf7..ad835e87 100644 --- a/otel-integration/CHANGELOG.md +++ b/otel-integration/CHANGELOG.md @@ -2,6 +2,10 @@ ## OpenTelemtry-Integration +### v0.0.52 / 2024-02-05 + +- [FEAT] Optionally allow users to use tail sampling for traces. + ### v0.0.51 / 2024-02-05 - [FIX] Fix Target allocator endpoint slices permission issue. diff --git a/otel-integration/k8s-helm/Chart.yaml b/otel-integration/k8s-helm/Chart.yaml index 4f7e1c41..bfca9f36 100644 --- a/otel-integration/k8s-helm/Chart.yaml +++ b/otel-integration/k8s-helm/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v2 name: otel-integration description: OpenTelemetry Integration -version: 0.0.51 +version: 0.0.52 keywords: - OpenTelemetry Collector - OpenTelemetry Agent @@ -11,19 +11,24 @@ keywords: dependencies: - name: opentelemetry-collector alias: opentelemetry-agent - version: "0.79.3" + version: "0.79.4" repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual condition: opentelemetry-agent.enabled - name: opentelemetry-collector alias: opentelemetry-agent-windows - version: "0.79.3" + version: "0.79.4" repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual condition: opentelemetry-agent-windows.enabled - name: opentelemetry-collector alias: opentelemetry-cluster-collector - version: "0.79.3" + version: "0.79.4" repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual 
condition: opentelemetry-cluster-collector.enabled + - name: opentelemetry-collector + alias: opentelemetry-gateway + version: "0.79.4" + repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual + condition: opentelemetry-gateway.enabled sources: - https://github.com/coralogix/opentelemetry-helm-charts/tree/main/charts/opentelemetry-collector maintainers: diff --git a/otel-integration/k8s-helm/README.md b/otel-integration/k8s-helm/README.md index d935274a..b145a0ea 100644 --- a/otel-integration/k8s-helm/README.md +++ b/otel-integration/k8s-helm/README.md @@ -142,6 +142,19 @@ helm upgrade --install otel-coralogix-integration coralogix-charts-virtual/otel- --render-subchart-notes -f values-crd-override.yaml --set global.clusterName= --set global.domain= ``` +### Enabling Tail Sampling + +If you want to use [Tail Sampling](https://opentelemetry.io/docs/concepts/sampling/#tail-sampling) to reduce the number of traces using the [tail sampling processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor), you can install `otel-integration` using the `tail-sampling-values.yaml` values. For example: + +```bash +helm repo add coralogix-charts-virtual https://cgx.jfrog.io/artifactory/coralogix-charts-virtual + +helm upgrade --install otel-coralogix-integration coralogix-charts-virtual/otel-integration \ + --render-subchart-notes -f tail-sampling-values.yaml +``` + +This change will configure the otel-agent pods to send span data to the `coralogix-opentelemetry-gateway` deployment using the [loadbalancing exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/loadbalancingexporter). Make sure to configure enough replicas, as well as resource requests and limits, to handle the load. Next, you will need to configure the [tail sampling processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor) with your custom tail sampling policies. 
+ ### Enabling scraping of Prometheus custom resources (`ServiceMonitor` and `PodMonitor`) If you're leveraging the Prometheus Operator custom resources (`ServiceMonitor` and `PodMonitor`) and you would like to keep using them with the OpenTelemetry collector, you can enable the scraping of these resources by a special, optional component called target allocator. This feature is disabled by default and can be enabled by setting the `opentelemetry-agent.targetAllocator.enabled` value to `true` in the `values.yaml` file. diff --git a/otel-integration/k8s-helm/tail-sampling-values.yaml b/otel-integration/k8s-helm/tail-sampling-values.yaml new file mode 100644 index 00000000..d020e3e3 --- /dev/null +++ b/otel-integration/k8s-helm/tail-sampling-values.yaml @@ -0,0 +1,63 @@ +global: + domain: "" + clusterName: "" + defaultApplicationName: "otel" + defaultSubsystemName: "integration" + logLevel: "warn" + collectionInterval: "30s" + +opentelemetry-agent: + enabled: true + mode: daemonset + presets: + loadBalancing: + enabled: true + routingKey: "traceID" + hostname: coralogix-opentelemetry-gateway + + config: + service: + pipelines: + traces: + exporters: + - loadbalancing + +opentelemetry-gateway: + enabled: true + # For production use-cases please increase replicas + # and resource requests and limits + replicaCount: 3 + # resources: + # requests: + # cpu: 0.5 + # memory: 256Mi + # limits: + # cpu: 2 + # memory: 2G + + config: + processors: + tail_sampling: + # Update configuration here, with your tail sampling policies + # Docs: https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor + decision_wait: 10s + num_traces: 100 + expected_new_traces_per_sec: 10 + policies: + [ + { + name: errors-policy, + type: status_code, + status_code: {status_codes: [ERROR]} + }, + { + name: randomized-policy, + type: probabilistic, + probabilistic: {sampling_percentage: 25} + }, + ] + +opentelemetry-cluster-collector: + enabled: true 
+opentelemetry-agent-windows: + enabled: false diff --git a/otel-integration/k8s-helm/values.yaml b/otel-integration/k8s-helm/values.yaml index 39299b3a..cda6d341 100644 --- a/otel-integration/k8s-helm/values.yaml +++ b/otel-integration/k8s-helm/values.yaml @@ -545,3 +545,119 @@ opentelemetry-cluster-collector: opentelemetry-agent-windows: enabled: false +opentelemetry-gateway: + enabled: false + mode: deployment + fullnameOverride: coralogix-opentelemetry-gateway + service: + enabled: true + clusterIP: "None" + extraEnvs: + - name: CORALOGIX_PRIVATE_KEY + valueFrom: + secretKeyRef: + name: coralogix-keys + key: PRIVATE_KEY + + config: + extensions: + zpages: + endpoint: localhost:55679 + pprof: + endpoint: localhost:1777 + exporters: + coralogix: + timeout: "30s" + private_key: "${CORALOGIX_PRIVATE_KEY}" + domain: "{{ .Values.global.domain }}" + application_name: "{{ .Values.global.defaultApplicationName }}" + subsystem_name: "{{ .Values.global.defaultSubsystemName }}" + application_name_attributes: + - "k8s.namespace.name" + - "service.namespace" + subsystem_name_attributes: + - "k8s.deployment.name" + - "k8s.statefulset.name" + - "k8s.daemonset.name" + - "k8s.cronjob.name" + - "service.name" + processors: + tail_sampling: + # Update configuration here, with your tail sampling policies + # Docs: https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor + decision_wait: 10s + num_traces: 100 + expected_new_traces_per_sec: 10 + policies: + [ + { + name: errors-policy, + type: status_code, + status_code: {status_codes: [ERROR]} + }, + { + name: randomized-policy, + type: probabilistic, + probabilistic: {sampling_percentage: 25} + }, + ] + receivers: + prometheus: + config: + scrape_configs: + - job_name: opentelemetry-collector + scrape_interval: 30s + static_configs: + - targets: + - ${MY_POD_IP}:8888 + otlp: + protocols: + grpc: + endpoint: ${MY_POD_IP}:4317 + service: + telemetry: + resource: + # Suppress this 
attribute, as we don't want the UUID of the collector to be sent, + # instead we rely on instance label generated by Prometheus receiver. + - service.instance.id: + - service.name: + logs: + level: "{{ .Values.global.logLevel }}" + encoding: json + metrics: + address: ${MY_POD_IP}:8888 + pipelines: + metrics: + exporters: + - coralogix + processors: + - memory_limiter + - batch + receivers: + - prometheus + traces: + exporters: + - coralogix + processors: + - memory_limiter + - tail_sampling + - batch + receivers: + - otlp + + + tolerations: + - operator: Exists + ports: + otlp: + enabled: true + otlp-http: + enabled: false + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false