From 32a2aadb42af77dee5479e8b22b7cd95b5218f7e Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Mon, 7 Oct 2024 14:10:00 +0200 Subject: [PATCH] Tracing downsampling (#319) * Basic configurable sampling policy * Separate sampling policies for workload, charm and error traces * Sampling config generation tests * fmt * Review remarks: simpler testing and better naming --- config.yaml | 26 ++++++- src/grafana_agent.py | 90 ++++++++++++++++++++++ tests/scenario/test_tracing_integration.py | 41 ++++++++++ 3 files changed, 156 insertions(+), 1 deletion(-) diff --git a/config.yaml b/config.yaml index 2ed9b4be..b55ca0e8 100644 --- a/config.yaml +++ b/config.yaml @@ -39,4 +39,28 @@ options: Force-enable the receiver for the 'jaeger_thrift_http' protocol in Grafana Agent, even if there is no integration currently requesting it. type: boolean - default: false \ No newline at end of file + default: false + tracing_sample_rate_charm: + description: > + This property defines the percentage of charm traces that are sent to the tracing backend. + Setting it to 100 would mean all charm traces are kept, setting to 0 means charm traces + aren't sent to the tracing backend at all. Anything outside of 0-100 range will be normalised + to this range by Grafana Agent. + type: float + default: 100.0 + tracing_sample_rate_workload: + description: > + This property defines the percentage of workload traces that are sent to the tracing backend. + Setting it to 100 would mean all workload traces are kept, setting to 0 means workload traces + aren't sent to the tracing backend at all. Anything outside of 0-100 range will be normalised + to this range by Grafana Agent. + type: float + default: 1.0 + tracing_sample_rate_error: + description: > + This property defines the percentage of error traces (from all sources) that are sent to the tracing backend. + Setting it to 100 would mean all error traces are kept, setting to 0 means error traces + aren't sent to the tracing backend at all. 
Anything outside of 0-100 range will be normalised + to this range by Grafana Agent. + type: float + default: 100.0 diff --git a/src/grafana_agent.py b/src/grafana_agent.py index 64038663..8a591c7c 100644 --- a/src/grafana_agent.py +++ b/src/grafana_agent.py @@ -912,6 +912,94 @@ def _receiver_config(protocol: str): return config + @property + def _tracing_sampling(self) -> Dict[str, Any]: + # policies, as defined by tail sampling processor definition: + # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor + # each of them is evaluated separately and processor decides whether to pass the trace through or not + # see the description of tail sampling processor above for the full decision tree + return { + "policies": [ + { + "name": "error-traces-policy", + "type": "and", + "and": { + "and_sub_policy": [ + { + "name": "trace-status-policy", + "type": "status_code", + "status_code": {"status_codes": ["ERROR"]}, + # status_code processor is using span_status property of spans within a trace + # see https://opentelemetry.io/docs/concepts/signals/traces/#span-status for reference + }, + { + "name": "probabilistic-policy", + "type": "probabilistic", + "probabilistic": { + "sampling_percentage": self.config.get( + "tracing_sample_rate_error" + ) + }, + }, + ] + }, + }, + { + "name": "charm-traces-policy", + "type": "and", + "and": { + "and_sub_policy": [ + { + "name": "service-name-policy", + "type": "string_attribute", + "string_attribute": { + "key": "service.name", + "values": [".+-charm"], + "enabled_regex_matching": True, + }, + }, + { + "name": "probabilistic-policy", + "type": "probabilistic", + "probabilistic": { + "sampling_percentage": self.config.get( + "tracing_sample_rate_charm" + ) + }, + }, + ] + }, + }, + { + "name": "workload-traces-policy", + "type": "and", + "and": { + "and_sub_policy": [ + { + "name": "service-name-policy", + "type": "string_attribute", + "string_attribute": { + "key": 
"service.name", + "values": [".+-charm"], + "enabled_regex_matching": True, + "invert_match": True, + }, + }, + { + "name": "probabilistic-policy", + "type": "probabilistic", + "probabilistic": { + "sampling_percentage": self.config.get( + "tracing_sample_rate_workload" + ) + }, + }, + ] + }, + }, + ] + } + @property def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]: """The tracing section of the config. @@ -921,6 +1009,7 @@ def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]: """ endpoints = self._tempo_endpoints_with_tls() receivers = self._tracing_receivers + sampling = self._tracing_sampling if not receivers: # pushing a config with an empty receivers section will cause gagent to error out @@ -932,6 +1021,7 @@ def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]: "name": "tempo", "remote_write": endpoints, "receivers": receivers, + "tail_sampling": sampling, } ] } diff --git a/tests/scenario/test_tracing_integration.py b/tests/scenario/test_tracing_integration.py index f27a7684..9a0b5be5 100644 --- a/tests/scenario/test_tracing_integration.py +++ b/tests/scenario/test_tracing_integration.py @@ -164,3 +164,44 @@ def test_tracing_relation_passthrough_with_force_enable(ctx, base_state, force_e # but we provide all providing_protocols = {r.protocol.name for r in tracing_provider_out.receivers} assert providing_protocols == {"otlp_grpc", "otlp_http"}.union(force_enable) + + +@pytest.mark.parametrize( + "sampling_config", + ( + {}, + { + "tracing_sample_rate_charm": 23.0, + "tracing_sample_rate_workload": 13.13, + "tracing_sample_rate_error": 42.42, + }, + ), +) +def test_tracing_sampling_config_is_present(ctx, base_state, sampling_config): + # GIVEN a tracing relation over the tracing-provider endpoint and one over tracing + tracing_provider = scenario.Relation( + "tracing-provider", + remote_app_data=TracingRequirerAppData(receivers=["otlp_http", "otlp_grpc"]).dump(), + ) + tracing = scenario.Relation( + "tracing", + 
remote_app_data=TracingProviderAppData( + receivers=[ + Receiver(protocol={"name": "otlp_grpc", "type": "grpc"}, url="http:foo.com:1111") + ] + ).dump(), + ) + + state = base_state.replace(relations=[tracing, tracing_provider], config=sampling_config) + # WHEN we process any setup event for the relation + state_out = ctx.run(tracing.changed_event, state) + + agent = state_out.get_container("agent") + + # THEN the grafana agent config has a non-empty traces tail_sampling section, whether or not sample rates are set in the charm config + fs = agent.get_filesystem(ctx) + gagent_config = fs.joinpath(*CONFIG_PATH.strip("/").split("/")) + assert gagent_config.exists() + yml = yaml.safe_load(gagent_config.read_text()) + + assert yml["traces"]["configs"][0]["tail_sampling"]