From 32a2aadb42af77dee5479e8b22b7cd95b5218f7e Mon Sep 17 00:00:00 2001
From: Mateusz Kulewicz <mateusz.kulewicz@canonical.com>
Date: Mon, 7 Oct 2024 14:10:00 +0200
Subject: [PATCH] Tracing downsampling (#319)

* Basic configurable sampling policy

* Separate sampling policies for workload, charm and error traces

* Sampling config generation tests

* fmt

* Review remarks: simpler testing and better naming
---
 config.yaml                                | 26 ++++++-
 src/grafana_agent.py                       | 90 ++++++++++++++++++++++
 tests/scenario/test_tracing_integration.py | 41 ++++++++++
 3 files changed, 156 insertions(+), 1 deletion(-)

diff --git a/config.yaml b/config.yaml
index 2ed9b4be..b55ca0e8 100644
--- a/config.yaml
+++ b/config.yaml
@@ -39,4 +39,28 @@ options:
       Force-enable the receiver for the 'jaeger_thrift_http' protocol in Grafana Agent, 
       even if there is no integration currently requesting it.
     type: boolean
-    default: false
\ No newline at end of file
+    default: false
+  tracing_sample_rate_charm:
+    description: >
+      This property defines the percentage of charm traces that are sent to the tracing backend.
+      Setting it to 100 would mean all charm traces are kept, setting to 0 means charm traces
+      aren't sent to the tracing backend at all. Anything outside of 0-100 range will be normalised 
+      to this range by Grafana Agent.
+    type: float
+    default: 100.0
+  tracing_sample_rate_workload:
+    description: >
+      This property defines the percentage of workload traces that are sent to the tracing backend.
+      Setting it to 100 would mean all workload traces are kept, setting to 0 means workload traces
+      aren't sent to the tracing backend at all. Anything outside of 0-100 range will be normalised 
+      to this range by Grafana Agent.
+    type: float
+    default: 1.0
+  tracing_sample_rate_error:
+    description: >
+      This property defines the percentage of error traces (from all sources) that are sent to the tracing backend.
+      Setting it to 100 would mean all error traces are kept, setting to 0 means error traces
+      aren't sent to the tracing backend at all. Anything outside of 0-100 range will be normalised 
+      to this range by Grafana Agent.
+    type: float
+    default: 100.0
diff --git a/src/grafana_agent.py b/src/grafana_agent.py
index 64038663..8a591c7c 100644
--- a/src/grafana_agent.py
+++ b/src/grafana_agent.py
@@ -912,6 +912,94 @@ def _receiver_config(protocol: str):
 
         return config
 
+    @property
+    def _tracing_sampling(self) -> Dict[str, Any]:
+        # Policies follow the tail sampling processor's configuration schema:
+        # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor
+        # Each policy is evaluated separately; the processor then decides whether to pass the trace through.
+        # See the tail sampling processor description linked above for the full decision tree.
+        return {
+            "policies": [
+                {
+                    "name": "error-traces-policy",
+                    "type": "and",
+                    "and": {
+                        "and_sub_policy": [
+                            {
+                                "name": "trace-status-policy",
+                                "type": "status_code",
+                                "status_code": {"status_codes": ["ERROR"]},
+                                # the status_code policy uses the span status of the spans within a trace
+                                # see https://opentelemetry.io/docs/concepts/signals/traces/#span-status for reference
+                            },
+                            {
+                                "name": "probabilistic-policy",
+                                "type": "probabilistic",
+                                "probabilistic": {
+                                    "sampling_percentage": self.config.get(
+                                        "tracing_sample_rate_error"
+                                    )
+                                },
+                            },
+                        ]
+                    },
+                },
+                {
+                    "name": "charm-traces-policy",
+                    "type": "and",
+                    "and": {
+                        "and_sub_policy": [
+                            {
+                                "name": "service-name-policy",
+                                "type": "string_attribute",
+                                "string_attribute": {
+                                    "key": "service.name",
+                                    "values": [".+-charm"],
+                                    "enabled_regex_matching": True,
+                                },
+                            },
+                            {
+                                "name": "probabilistic-policy",
+                                "type": "probabilistic",
+                                "probabilistic": {
+                                    "sampling_percentage": self.config.get(
+                                        "tracing_sample_rate_charm"
+                                    )
+                                },
+                            },
+                        ]
+                    },
+                },
+                {
+                    "name": "workload-traces-policy",
+                    "type": "and",
+                    "and": {
+                        "and_sub_policy": [
+                            {
+                                "name": "service-name-policy",
+                                "type": "string_attribute",
+                                "string_attribute": {
+                                    "key": "service.name",
+                                    "values": [".+-charm"],
+                                    "enabled_regex_matching": True,
+                                    "invert_match": True,
+                                },
+                            },
+                            {
+                                "name": "probabilistic-policy",
+                                "type": "probabilistic",
+                                "probabilistic": {
+                                    "sampling_percentage": self.config.get(
+                                        "tracing_sample_rate_workload"
+                                    )
+                                },
+                            },
+                        ]
+                    },
+                },
+            ]
+        }
+
     @property
     def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]:
         """The tracing section of the config.
@@ -921,6 +1009,7 @@ def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]:
         """
         endpoints = self._tempo_endpoints_with_tls()
         receivers = self._tracing_receivers
+        sampling = self._tracing_sampling
 
         if not receivers:
             # pushing a config with an empty receivers section will cause gagent to error out
@@ -932,6 +1021,7 @@ def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]:
                     "name": "tempo",
                     "remote_write": endpoints,
                     "receivers": receivers,
+                    "tail_sampling": sampling,
                 }
             ]
         }
diff --git a/tests/scenario/test_tracing_integration.py b/tests/scenario/test_tracing_integration.py
index f27a7684..9a0b5be5 100644
--- a/tests/scenario/test_tracing_integration.py
+++ b/tests/scenario/test_tracing_integration.py
@@ -164,3 +164,44 @@ def test_tracing_relation_passthrough_with_force_enable(ctx, base_state, force_e
     # but we provide all
     providing_protocols = {r.protocol.name for r in tracing_provider_out.receivers}
     assert providing_protocols == {"otlp_grpc", "otlp_http"}.union(force_enable)
+
+
+@pytest.mark.parametrize(
+    "sampling_config",
+    (
+        {},
+        {
+            "tracing_sample_rate_charm": 23.0,
+            "tracing_sample_rate_workload": 13.13,
+            "tracing_sample_rate_error": 42.42,
+        },
+    ),
+)
+def test_tracing_sampling_config_is_present(ctx, base_state, sampling_config):
+    # GIVEN a tracing relation over the tracing-provider endpoint and one over tracing
+    tracing_provider = scenario.Relation(
+        "tracing-provider",
+        remote_app_data=TracingRequirerAppData(receivers=["otlp_http", "otlp_grpc"]).dump(),
+    )
+    tracing = scenario.Relation(
+        "tracing",
+        remote_app_data=TracingProviderAppData(
+            receivers=[
+                Receiver(protocol={"name": "otlp_grpc", "type": "grpc"}, url="http:foo.com:1111")
+            ]
+        ).dump(),
+    )
+
+    state = base_state.replace(relations=[tracing, tracing_provider], config=sampling_config)
+    # WHEN we process any setup event for the relation
+    state_out = ctx.run(tracing.changed_event, state)
+
+    agent = state_out.get_container("agent")
+
+    # THEN the grafana agent config has a non-empty traces tail_sampling section
+    fs = agent.get_filesystem(ctx)
+    gagent_config = fs.joinpath(*CONFIG_PATH.strip("/").split("/"))
+    assert gagent_config.exists()
+    yml = yaml.safe_load(gagent_config.read_text())
+
+    assert yml["traces"]["configs"][0]["tail_sampling"]