From a1b638deca9b9648228af862a528200203b880e3 Mon Sep 17 00:00:00 2001 From: bryn Date: Thu, 3 Oct 2024 13:36:14 +0100 Subject: [PATCH 01/26] Add `experimental_datadog_agent_sampling` This mode will change the behaviour of the router for tracing in the following ways: * Spans are never dropped, instead they are converted to RecordOnly. * Spans that are sent to otlp and datadog exporters will always look like they have been sampled. * The `sampling.priority` attribute is populated on spans. * `psr` is populated in trace state. * `m` is populated in trace state. --- ...datadog_upstream_sampling_decision_test.md | 48 + ...nfiguration__tests__schema_generation.snap | 6 + apollo-router/src/plugins/telemetry/config.rs | 30 +- apollo-router/src/plugins/telemetry/mod.rs | 110 ++- .../src/plugins/telemetry/otel/layer.rs | 4 +- .../src/plugins/telemetry/otel/tracer.rs | 9 +- ....field_instrumentation_sampler.router.yaml | 11 + .../tracing/datadog/agent_sampling.rs | 376 ++++++++ .../tracing/{datadog.rs => datadog/mod.rs} | 36 +- .../tracing/datadog/span_processor.rs | 133 +++ .../datadog_exporter/exporter/model/v05.rs | 21 +- .../telemetry/tracing/datadog_exporter/mod.rs | 189 ++-- .../src/plugins/telemetry/tracing/mod.rs | 8 + .../src/plugins/telemetry/tracing/otlp.rs | 27 +- apollo-router/tests/common.rs | 59 +- apollo-router/tests/integration/mod.rs | 9 + .../tests/integration/telemetry/datadog.rs | 418 ++++++++- .../telemetry/fixtures/datadog.router.yaml | 1 + ...atadog_agent_sampling_disabled.router.yaml | 23 + .../datadog_default_span_names.router.yaml | 1 + .../datadog_no_parent_sampler.router.yaml | 28 + .../fixtures/datadog_no_sample.router.yaml | 1 + .../datadog_override_span_names.router.yaml | 1 + ...tadog_override_span_names_late.router.yaml | 1 + ...tadog_resource_mapping_default.router.yaml | 1 + ...adog_resource_mapping_override.router.yaml | 1 + .../telemetry/fixtures/otlp.router.yaml | 14 +- .../otlp_datadog_agent_no_sample.router.yaml | 42 + 
.../otlp_datadog_agent_sample.router.yaml | 42 + ...datadog_agent_sample_no_sample.router.yaml | 42 + .../otlp_datadog_propagation.router.yaml | 39 + ...p_datadog_propagation_no_agent.router.yaml | 38 + ..._propagation_no_parent_sampler.router.yaml | 40 + ...request_with_zipkin_propagator.router.yaml | 40 + .../otlp_no_parent_sampler.router.yaml | 25 + .../tests/integration/telemetry/jaeger.rs | 8 +- .../tests/integration/telemetry/otlp.rs | 876 +++++++++++++++--- .../telemetry/exporters/tracing/datadog.mdx | 65 +- 38 files changed, 2490 insertions(+), 333 deletions(-) create mode 100644 .changesets/fix_bryn_datadog_upstream_sampling_decision_test.md create mode 100644 apollo-router/src/plugins/telemetry/testdata/config.field_instrumentation_sampler.router.yaml create mode 100644 apollo-router/src/plugins/telemetry/tracing/datadog/agent_sampling.rs rename apollo-router/src/plugins/telemetry/tracing/{datadog.rs => datadog/mod.rs} (93%) create mode 100644 apollo-router/src/plugins/telemetry/tracing/datadog/span_processor.rs create mode 100644 apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_no_sample.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample_no_sample.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_agent.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml create mode 100644 
apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/fixtures/otlp_no_parent_sampler.router.yaml diff --git a/.changesets/fix_bryn_datadog_upstream_sampling_decision_test.md b/.changesets/fix_bryn_datadog_upstream_sampling_decision_test.md new file mode 100644 index 0000000000..bb6447a66b --- /dev/null +++ b/.changesets/fix_bryn_datadog_upstream_sampling_decision_test.md @@ -0,0 +1,48 @@ +### Respect x-datadog-sampling-priority ([PR #6017](https://github.com/apollographql/router/pull/6017)) + +This PR consists of two fixes: +#### Datadog priority sampling resolution is not lost. + +Previously a `x-datadog-sampling-priority` of `-1` would be converted to `0` for downstream requests and `2` would be converted to `1`. + +#### The sampler option in the `telemetry.exporters.tracing.common.sampler` is not datadog aware. + +To get accurate APM metrics all spans must be sent to the datadog agent with a `psr` or `sampling.priority` attribute set appropriately to record the sampling decision. + +`preview_datadog_agent_sampling` option in the router.yaml enables this behavior and should be used when exporting to the datadog agent via OTLP or datadog native. + +```yaml +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + sampler: 0.1 + # Send all spans to the Datadog agent. + preview_datadog_agent_sampling: true + + # Example OTLP exporter configuration + otlp: + enabled: true + # Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario. 
+ batch_processor: + max_concurrent_exports: 100 + + # Example Datadog native exporter configuration + datadog: + enabled: true + + # Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario. + batch_processor: + max_concurrent_exports: 100 +``` + +By using these options, you can decrease your Datadog bill as you will only be sending a percentage of spans from the Datadog agent to datadog. + +> [!IMPORTANT] +> Users must enable `preview_datadog_agent_sampling` to get accurate APM metrics. + +> [!IMPORTANT] +> Sending all spans to the datadog agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and the Datadog native exporter. + +By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/6017 diff --git a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap index 4eba0206d0..af25ac3158 100644 --- a/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap +++ b/apollo-router/src/configuration/snapshots/apollo_router__configuration__tests__schema_generation.snap @@ -7306,6 +7306,12 @@ expression: "&schema" "description": "Whether to use parent based sampling", "type": "boolean" }, + "preview_datadog_agent_sampling": { + "default": null, + "description": "Use datadog agent sampling. 
This means that all spans will be sent to the Datadog agent and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog", + "nullable": true, + "type": "boolean" + }, "resource": { "additionalProperties": { "$ref": "#/definitions/AttributeValue", diff --git a/apollo-router/src/plugins/telemetry/config.rs b/apollo-router/src/plugins/telemetry/config.rs index 4c9be01135..8dc84e85c0 100644 --- a/apollo-router/src/plugins/telemetry/config.rs +++ b/apollo-router/src/plugins/telemetry/config.rs @@ -24,6 +24,7 @@ use super::*; use crate::plugin::serde::deserialize_option_header_name; use crate::plugins::telemetry::metrics; use crate::plugins::telemetry::resource::ConfigResource; +use crate::plugins::telemetry::tracing::datadog::DatadogAgentSampling; use crate::Configuration; #[derive(thiserror::Error, Debug)] @@ -347,6 +348,9 @@ pub(crate) struct TracingCommon { pub(crate) service_namespace: Option, /// The sampler, always_on, always_off or a decimal between 0.0 and 1.0 pub(crate) sampler: SamplerOption, + /// Use datadog agent sampling. 
This means that all spans will be sent to the Datadog agent + /// and the `sampling.priority` attribute will be used to control if the span will then be sent to Datadog + pub(crate) preview_datadog_agent_sampling: Option, /// Whether to use parent based sampling pub(crate) parent_based_sampler: bool, /// The maximum events per span before discarding @@ -401,6 +405,7 @@ impl Default for TracingCommon { service_name: Default::default(), service_namespace: Default::default(), sampler: default_sampler(), + preview_datadog_agent_sampling: None, parent_based_sampler: default_parent_based_sampler(), max_events_per_span: default_max_events_per_span(), max_attributes_per_span: default_max_attributes_per_span(), @@ -668,8 +673,15 @@ impl From<&TracingCommon> for opentelemetry::sdk::trace::Config { if config.parent_based_sampler { sampler = parent_based(sampler); } + if config.preview_datadog_agent_sampling.unwrap_or_default() { + common = common.with_sampler(DatadogAgentSampling::new( + sampler, + config.parent_based_sampler, + )); + } else { + common = common.with_sampler(sampler); + } - common = common.with_sampler(sampler); common = common.with_max_events_per_span(config.max_events_per_span); common = common.with_max_attributes_per_span(config.max_attributes_per_span); common = common.with_max_links_per_span(config.max_links_per_span); @@ -688,6 +700,22 @@ fn parent_based(sampler: opentelemetry::sdk::trace::Sampler) -> opentelemetry::s impl Conf { pub(crate) fn calculate_field_level_instrumentation_ratio(&self) -> Result { + // Because when datadog is enabled the global sampling is overridden to always_on + if self + .exporters + .tracing + .common + .preview_datadog_agent_sampling + .unwrap_or_default() + { + let field_ratio = match &self.apollo.field_level_instrumentation_sampler { + SamplerOption::TraceIdRatioBased(ratio) => *ratio, + SamplerOption::Always(Sampler::AlwaysOn) => 1.0, + SamplerOption::Always(Sampler::AlwaysOff) => 0.0, + }; + + return Ok(field_ratio); + }
Ok( match ( &self.exporters.tracing.common.sampler, diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 1478261be5..fa6fa6494f 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -286,6 +286,20 @@ impl Plugin for Telemetry { .expect("otel error handler lock poisoned, fatal"); let mut config = init.config; + // This code would have enabled datadog agent sampling by default, but for now we will leave it as opt-in. + // If the datadog exporter is enabled then enable the agent sampler. + // If users are using otlp export then they will need to set this explicitly in their config. + // + // if config.exporters.tracing.datadog.enabled() + // && config + // .exporters + // .tracing + // .common + // .preview_datadog_agent_sampling + // .is_none() + // { + // config.exporters.tracing.common.preview_datadog_agent_sampling = Some(true); + // } config.instrumentation.spans.update_defaults(); config.instrumentation.instruments.update_defaults(); config.exporters.logging.validate()?; @@ -866,7 +880,21 @@ impl Telemetry { // Only apply things if we were executing in the context of a vanilla the Apollo executable. // Users that are rolling their own routers will need to set up telemetry themselves. if let Some(hot_tracer) = OPENTELEMETRY_TRACER_HANDLE.get() { - otel::layer::configure(&self.sampling_filter_ratio); + // If the datadog agent sampling is enabled, then we cannot presample the spans + // Therefore we set presampling to always on and let the regular sampler do the work. + // Effectively, we are disabling the presampling. + if self + .config + .exporters + .tracing + .common + .preview_datadog_agent_sampling + .unwrap_or_default() + { + otel::layer::configure(&SamplerOption::Always(Sampler::AlwaysOn)); + } else { + otel::layer::configure(&self.sampling_filter_ratio); + } // The reason that this has to happen here is that we are interacting with global state. 
// If we do this logic during plugin init then if a subsequent plugin fails to init then we @@ -889,7 +917,8 @@ impl Telemetry { Self::checked_global_tracer_shutdown(last_provider); - opentelemetry::global::set_text_map_propagator(Self::create_propagator(&self.config)); + let propagator = Self::create_propagator(&self.config); + opentelemetry::global::set_text_map_propagator(propagator); } activation.reload_metrics(); @@ -934,9 +963,6 @@ impl Telemetry { if propagation.zipkin || tracing.zipkin.enabled { propagators.push(Box::::default()); } - if propagation.datadog || tracing.datadog.enabled() { - propagators.push(Box::::default()); - } if propagation.aws_xray { propagators.push(Box::::default()); } @@ -946,6 +972,9 @@ impl Telemetry { propagation.request.format.clone(), ))); } + if propagation.datadog || tracing.datadog.enabled() { + propagators.push(Box::::default()); + } TextMapCompositePropagator::new(propagators) } @@ -957,9 +986,14 @@ impl Telemetry { let spans_config = &config.instrumentation.spans; let mut common = tracing_config.common.clone(); let mut sampler = common.sampler.clone(); - // set it to AlwaysOn: it is now done in the SamplingFilter, so whatever is sent to an exporter - // should be accepted - common.sampler = SamplerOption::Always(Sampler::AlwaysOn); + + // To enable pre-sampling to work we need to disable regular sampling. + // This is because the pre-sampler will sample the spans before they are sent to the regular sampler + // If the datadog agent sampling is enabled, then we cannot pre-sample the spans because even if the sampling decision is made to drop + // DatadogAgentSampler will modify the decision to RecordAndSample and instead use the sampling.priority attribute to decide if the span should be sampled or not.
+ if !common.preview_datadog_agent_sampling.unwrap_or_default() { + common.sampler = SamplerOption::Always(Sampler::AlwaysOn); + } let mut builder = opentelemetry::sdk::trace::TracerProvider::builder().with_config((&common).into()); @@ -2130,6 +2164,8 @@ mod tests { use std::collections::HashMap; use std::fmt::Debug; use std::ops::DerefMut; + use std::sync::atomic::AtomicUsize; + use std::sync::atomic::Ordering; use std::sync::Arc; use std::sync::Mutex; use std::time::Duration; @@ -2187,6 +2223,7 @@ mod tests { use crate::plugins::demand_control::COST_STRATEGY_KEY; use crate::plugins::telemetry::config::TraceIdFormat; use crate::plugins::telemetry::handle_error_internal; + use crate::plugins::telemetry::EnableSubgraphFtv1; use crate::services::router::body::get_body_bytes; use crate::services::RouterRequest; use crate::services::RouterResponse; @@ -2832,6 +2869,63 @@ mod tests { .await; } + #[tokio::test] + async fn test_field_instrumentation_sampler_with_preview_datadog_agent_sampling() { + let plugin = create_plugin_with_config(include_str!( + "testdata/config.field_instrumentation_sampler.router.yaml" + )) + .await; + + let ftv1_counter = Arc::new(AtomicUsize::new(0)); + let ftv1_counter_cloned = ftv1_counter.clone(); + + let mut mock_request_service = MockSupergraphService::new(); + mock_request_service + .expect_call() + .times(10) + .returning(move |req: SupergraphRequest| { + if req + .context + .extensions() + .with_lock(|lock| lock.contains_key::()) + { + ftv1_counter_cloned.fetch_add(1, Ordering::Relaxed); + } + Ok(SupergraphResponse::fake_builder() + .context(req.context) + .status_code(StatusCode::OK) + .header("content-type", "application/json") + .data(json!({"errors": [{"message": "nope"}]})) + .build() + .unwrap()) + }); + let mut request_supergraph_service = + plugin.supergraph_service(BoxService::new(mock_request_service)); + + for _ in 0..10 { + let supergraph_req = SupergraphRequest::fake_builder() + .header("x-custom", "TEST") + 
.header("conditional-custom", "X") + .header("custom-length", "55") + .header("content-length", "55") + .header("content-type", "application/graphql") + .query("Query test { me {name} }") + .operation_name("test".to_string()); + let _router_response = request_supergraph_service + .ready() + .await + .unwrap() + .call(supergraph_req.build().unwrap()) + .await + .unwrap() + .next_response() + .await + .unwrap(); + } + // It should be 100% because when we set preview_datadog_agent_sampling, we only take the value of field_level_instrumentation_sampler + assert_eq!(ftv1_counter.load(Ordering::Relaxed), 10); + } + #[tokio::test] async fn test_subgraph_metrics_ok() { async { diff --git a/apollo-router/src/plugins/telemetry/otel/layer.rs b/apollo-router/src/plugins/telemetry/otel/layer.rs index 866bf50a35..86415d2b4d 100644 --- a/apollo-router/src/plugins/telemetry/otel/layer.rs +++ b/apollo-router/src/plugins/telemetry/otel/layer.rs @@ -677,13 +677,13 @@ pub(crate) fn configure(sampler: &SamplerOption) { }, }; - SPAN_SAMPLING_RATE.store(f64::to_bits(ratio), Ordering::Relaxed); + SPAN_SAMPLING_RATE.store(f64::to_bits(ratio), Ordering::SeqCst); } impl OpenTelemetryLayer { fn sample(&self) -> bool { let s: f64 = thread_rng().gen_range(0.0..=1.0); - s <= f64::from_bits(SPAN_SAMPLING_RATE.load(Ordering::Relaxed)) + s <= f64::from_bits(SPAN_SAMPLING_RATE.load(Ordering::SeqCst)) } } diff --git a/apollo-router/src/plugins/telemetry/otel/tracer.rs b/apollo-router/src/plugins/telemetry/otel/tracer.rs index 463fd8cb2c..6b11bab9ad 100644 --- a/apollo-router/src/plugins/telemetry/otel/tracer.rs +++ b/apollo-router/src/plugins/telemetry/otel/tracer.rs @@ -16,7 +16,6 @@ use opentelemetry_sdk::trace::Tracer as SdkTracer; use opentelemetry_sdk::trace::TracerProvider as SdkTracerProvider; use super::OtelData; -use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState; /// An interface for authors of OpenTelemetry SDKs to build pre-sampled tracers. 
/// @@ -81,6 +80,7 @@ impl PreSampledTracer for SdkTracer { let parent_cx = &data.parent_cx; let builder = &mut data.builder; + // If we have a parent span that means we have a parent span coming from a propagator // Gather trace state let (trace_id, parent_trace_flags) = current_trace_state(builder, parent_cx, &provider); @@ -159,12 +159,7 @@ fn process_sampling_result( decision: SamplingDecision::RecordAndSample, trace_state, .. - } => Some(( - trace_flags | TraceFlags::SAMPLED, - trace_state - .with_priority_sampling(true) - .with_measuring(true), - )), + } => Some((trace_flags | TraceFlags::SAMPLED, trace_state.clone())), } } diff --git a/apollo-router/src/plugins/telemetry/testdata/config.field_instrumentation_sampler.router.yaml b/apollo-router/src/plugins/telemetry/testdata/config.field_instrumentation_sampler.router.yaml new file mode 100644 index 0000000000..54f4167b22 --- /dev/null +++ b/apollo-router/src/plugins/telemetry/testdata/config.field_instrumentation_sampler.router.yaml @@ -0,0 +1,11 @@ +telemetry: + instrumentation: + spans: + mode: spec_compliant + apollo: + field_level_instrumentation_sampler: 1.0 + exporters: + tracing: + common: + preview_datadog_agent_sampling: true + sampler: 0.5 \ No newline at end of file diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog/agent_sampling.rs b/apollo-router/src/plugins/telemetry/tracing/datadog/agent_sampling.rs new file mode 100644 index 0000000000..2fc04e94bd --- /dev/null +++ b/apollo-router/src/plugins/telemetry/tracing/datadog/agent_sampling.rs @@ -0,0 +1,376 @@ +use opentelemetry_api::trace::Link; +use opentelemetry_api::trace::SamplingDecision; +use opentelemetry_api::trace::SamplingResult; +use opentelemetry_api::trace::SpanKind; +use opentelemetry_api::trace::TraceId; +use opentelemetry_api::Key; +use opentelemetry_api::KeyValue; +use opentelemetry_api::OrderMap; +use opentelemetry_api::Value; +use opentelemetry_sdk::trace::ShouldSample; + +use 
crate::plugins::telemetry::tracing::datadog_exporter::propagator::SamplingPriority; +use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState; + +/// The Datadog Agent Sampler +/// +/// This sampler overrides the sampling decision to ensure that spans are recorded even if they were originally dropped. +/// It performs the following tasks: +/// 1. Ensures the appropriate trace state is set +/// 2. Adds the sampling.priority attribute to the span +/// +/// The sampler can be configured to use parent-based sampling for consistent trace sampling. +/// +#[derive(Debug, Clone)] +pub(crate) struct DatadogAgentSampling { + /// The underlying sampler used for initial sampling decisions + pub(crate) sampler: opentelemetry::sdk::trace::Sampler, + /// Flag to enable parent-based sampling for consistent trace sampling + pub(crate) parent_based_sampler: bool, +} +impl DatadogAgentSampling { + /// Creates a new DatadogAgentSampling instance + /// + /// # Arguments + /// * `sampler` - The underlying sampler to use for initial sampling decisions + /// * `parent_based_sampler` - Whether to use parent-based sampling for consistent trace sampling + pub(crate) fn new( + sampler: opentelemetry::sdk::trace::Sampler, + parent_based_sampler: bool, + ) -> Self { + Self { + sampler, + parent_based_sampler, + } + } +} + +impl ShouldSample for DatadogAgentSampling { + fn should_sample( + &self, + parent_context: Option<&opentelemetry_api::Context>, + trace_id: TraceId, + name: &str, + span_kind: &SpanKind, + attributes: &OrderMap, + links: &[Link], + ) -> SamplingResult { + let mut result = self.sampler.should_sample( + parent_context, + trace_id, + name, + span_kind, + attributes, + links, + ); + // Override the sampling decision to record and make sure that the trace state is set correctly + // if either parent sampling is disabled or it has not been populated by a propagator. 
+ // The propagator gets first dibs on setting the trace state, so if it sets it, we don't override it unless we are not parent based. + match result.decision { + SamplingDecision::Drop | SamplingDecision::RecordOnly => { + result.decision = SamplingDecision::RecordOnly; + if !self.parent_based_sampler || result.trace_state.sampling_priority().is_none() { + result.trace_state = result + .trace_state + .with_priority_sampling(SamplingPriority::AutoReject) + } + } + SamplingDecision::RecordAndSample => { + if !self.parent_based_sampler || result.trace_state.sampling_priority().is_none() { + result.trace_state = result + .trace_state + .with_priority_sampling(SamplingPriority::AutoKeep) + } + } + } + + // We always want to measure + result.trace_state = result.trace_state.with_measuring(true); + // We always want to set the sampling.priority attribute in case we are communicating with the agent via otlp. + // Reverse engineered from https://github.com/DataDog/datadog-agent/blob/c692f62423f93988b008b669008f9199a5ad196b/pkg/trace/api/otlp.go#L502 + result.attributes.push(KeyValue::new( + "sampling.priority", + Value::I64( + result + .trace_state + .sampling_priority() + .expect("sampling priority") + .as_i64(), + ), + )); + result + } +} +#[cfg(test)] +mod tests { + use buildstructor::Builder; + use opentelemetry::sdk::trace::Sampler; + use opentelemetry::trace::TraceState; + use opentelemetry_api::trace::Link; + use opentelemetry_api::trace::SamplingDecision; + use opentelemetry_api::trace::SamplingResult; + use opentelemetry_api::trace::SpanContext; + use opentelemetry_api::trace::SpanId; + use opentelemetry_api::trace::SpanKind; + use opentelemetry_api::trace::TraceContextExt; + use opentelemetry_api::trace::TraceFlags; + use opentelemetry_api::trace::TraceId; + use opentelemetry_api::Context; + use opentelemetry_api::Key; + use opentelemetry_api::OrderMap; + use opentelemetry_api::Value; + use opentelemetry_sdk::trace::ShouldSample; + + use 
crate::plugins::telemetry::tracing::datadog::DatadogAgentSampling; + use crate::plugins::telemetry::tracing::datadog_exporter::propagator::SamplingPriority; + use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState; + + #[derive(Debug, Clone, Builder)] + struct StubSampler { + decision: SamplingDecision, + } + + impl ShouldSample for StubSampler { + fn should_sample( + &self, + _parent_context: Option<&Context>, + _trace_id: TraceId, + _name: &str, + _span_kind: &SpanKind, + _attributes: &OrderMap, + _links: &[Link], + ) -> SamplingResult { + SamplingResult { + decision: self.decision.clone(), + attributes: Vec::new(), + trace_state: Default::default(), + } + } + } + + #[test] + fn test_should_sample_drop() { + // Test case where the sampling decision is Drop + let sampler = StubSampler::builder() + .decision(SamplingDecision::Drop) + .build(); + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), false); + + let result = datadog_sampler.should_sample( + None, + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Verify that the decision is RecordOnly (converted from Drop) + assert_eq!(result.decision, SamplingDecision::RecordOnly); + // Verify that the sampling priority is set to AutoReject + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::AutoReject) + ); + // Verify that the sampling.priority attribute is set correctly + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::AutoReject.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_should_sample_record_only() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::RecordOnly) + .build(); + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), false); + + let result 
= datadog_sampler.should_sample( + None, + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Record only should remain as record only + assert_eq!(result.decision, SamplingDecision::RecordOnly); + + // Verify that the sampling priority is set to AutoReject so the trace won't be transmitted to Datadog + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::AutoReject) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::AutoReject.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_should_sample_record_and_sample() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::RecordAndSample) + .build(); + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), false); + + let result = datadog_sampler.should_sample( + None, + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Record and sample should remain as record and sample + assert_eq!(result.decision, SamplingDecision::RecordAndSample); + + // Verify that the sampling priority is set to AutoKeep so the trace will be transmitted to Datadog + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::AutoKeep) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::AutoKeep.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_should_sample_with_parent_based_sampler() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::RecordAndSample) + .build(); + + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), true); + + let result = 
datadog_sampler.should_sample( + Some(&Context::new()), + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Record and sample should remain as record and sample + assert_eq!(result.decision, SamplingDecision::RecordAndSample); + + // Verify that the sampling priority is set to AutoKeep so the trace will be transmitted to Datadog + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::AutoKeep) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::AutoKeep.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_trace_state_already_populated_record_and_sample() { + let sampler = StubSampler::builder() + .decision(SamplingDecision::RecordAndSample) + .build(); + + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), true); + + let result = datadog_sampler.should_sample( + Some(&Context::new().with_remote_span_context(SpanContext::new( + TraceId::from_u128(1), + SpanId::from_u64(1), + TraceFlags::SAMPLED, + true, + TraceState::default().with_priority_sampling(SamplingPriority::UserReject), + ))), + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Record and sample should remain as record and sample + assert_eq!(result.decision, SamplingDecision::RecordAndSample); + + // Verify that the sampling priority is not overridden + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::UserReject) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::UserReject.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } + + #[test] + fn test_trace_state_already_populated_record_drop() { + let 
sampler = StubSampler::builder() + .decision(SamplingDecision::Drop) + .build(); + + let datadog_sampler = + DatadogAgentSampling::new(Sampler::ParentBased(Box::new(sampler)), true); + + let result = datadog_sampler.should_sample( + Some(&Context::new().with_remote_span_context(SpanContext::new( + TraceId::from_u128(1), + SpanId::from_u64(1), + TraceFlags::default(), + true, + TraceState::default().with_priority_sampling(SamplingPriority::UserReject), + ))), + TraceId::from_u128(1), + "test_span", + &SpanKind::Internal, + &OrderMap::new(), + &[], + ); + + // Drop is converted to RecordOnly + assert_eq!(result.decision, SamplingDecision::RecordOnly); + + // Verify that the sampling priority is not overridden + assert_eq!( + result.trace_state.sampling_priority(), + Some(SamplingPriority::UserReject) + ); + assert!(result + .attributes + .iter() + .any(|kv| kv.key.as_str() == "sampling.priority" + && kv.value == Value::I64(SamplingPriority::UserReject.as_i64()))); + + // Verify that measuring is enabled + assert!(result.trace_state.measuring_enabled()); + } +} diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog.rs b/apollo-router/src/plugins/telemetry/tracing/datadog/mod.rs similarity index 93% rename from apollo-router/src/plugins/telemetry/tracing/datadog.rs rename to apollo-router/src/plugins/telemetry/tracing/datadog/mod.rs index 4574b529ff..66fc09f108 100644 --- a/apollo-router/src/plugins/telemetry/tracing/datadog.rs +++ b/apollo-router/src/plugins/telemetry/tracing/datadog/mod.rs @@ -1,15 +1,18 @@ //! Configuration for datadog tracing. 
+mod agent_sampling; +mod span_processor; + use std::fmt::Debug; use std::fmt::Formatter; use std::time::Duration; +pub(crate) use agent_sampling::DatadogAgentSampling; use ahash::HashMap; use ahash::HashMapExt; use futures::future::BoxFuture; use http::Uri; use opentelemetry::sdk; -use opentelemetry::sdk::trace::BatchSpanProcessor; use opentelemetry::sdk::trace::Builder; use opentelemetry::Value; use opentelemetry_api::trace::SpanContext; @@ -23,6 +26,7 @@ use opentelemetry_semantic_conventions::resource::SERVICE_NAME; use opentelemetry_semantic_conventions::resource::SERVICE_VERSION; use schemars::JsonSchema; use serde::Deserialize; +pub(crate) use span_processor::DatadogSpanProcessor; use tower::BoxError; use crate::plugins::telemetry::config::GenericWith; @@ -210,18 +214,24 @@ impl TracingConfigurator for Config { let mut span_metrics = default_span_metrics(); span_metrics.extend(self.span_metrics.clone()); - Ok(builder.with_span_processor( - BatchSpanProcessor::builder( - ExporterWrapper { - delegate: exporter, - span_metrics, - }, - opentelemetry::runtime::Tokio, - ) - .with_batch_config(self.batch_processor.clone().into()) - .build() - .filtered(), - )) + let batch_processor = opentelemetry::sdk::trace::BatchSpanProcessor::builder( + ExporterWrapper { + delegate: exporter, + span_metrics, + }, + opentelemetry::runtime::Tokio, + ) + .with_batch_config(self.batch_processor.clone().into()) + .build() + .filtered(); + + Ok( + if trace.preview_datadog_agent_sampling.unwrap_or_default() { + builder.with_span_processor(batch_processor.always_sampled()) + } else { + builder.with_span_processor(batch_processor) + }, + ) } } diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog/span_processor.rs b/apollo-router/src/plugins/telemetry/tracing/datadog/span_processor.rs new file mode 100644 index 0000000000..7c879c310a --- /dev/null +++ b/apollo-router/src/plugins/telemetry/tracing/datadog/span_processor.rs @@ -0,0 +1,133 @@ +use 
opentelemetry_api::trace::SpanContext; +use opentelemetry_api::trace::TraceResult; +use opentelemetry_api::Context; +use opentelemetry_sdk::export::trace::SpanData; +use opentelemetry_sdk::trace::Span; +use opentelemetry_sdk::trace::SpanProcessor; + +/// When using the Datadog agent we need spans to always be exported. However, the batch span processor will only export spans that are sampled. +/// This wrapper will override the trace flags to always sample. +/// THe datadog exporter itself will look at the `sampling.priority` trace context attribute to determine if the span should be sampled. +#[derive(Debug)] +pub(crate) struct DatadogSpanProcessor { + delegate: T, +} + +impl DatadogSpanProcessor { + pub(crate) fn new(delegate: T) -> Self { + Self { delegate } + } +} + +impl SpanProcessor for DatadogSpanProcessor { + fn on_start(&self, span: &mut Span, cx: &Context) { + self.delegate.on_start(span, cx) + } + + fn on_end(&self, mut span: SpanData) { + // Note that the trace state for measuring and sampling priority is handled in the AgentSampler + // The only purpose of this span processor is to ensure that a span can pass through a batch processor. 
+ let new_trace_flags = span.span_context.trace_flags().with_sampled(true); + span.span_context = SpanContext::new( + span.span_context.trace_id(), + span.span_context.span_id(), + new_trace_flags, + span.span_context.is_remote(), + span.span_context.trace_state().clone(), + ); + self.delegate.on_end(span) + } + + fn force_flush(&self) -> TraceResult<()> { + self.delegate.force_flush() + } + + fn shutdown(&mut self) -> TraceResult<()> { + self.delegate.shutdown() + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::sync::Mutex; + use std::time::SystemTime; + + use opentelemetry_api::trace::SpanId; + use opentelemetry_api::trace::SpanKind; + use opentelemetry_api::trace::TraceFlags; + use opentelemetry_api::trace::TraceId; + use opentelemetry_api::Context; + use opentelemetry_sdk::trace::EvictedHashMap; + use opentelemetry_sdk::trace::EvictedQueue; + use opentelemetry_sdk::trace::SpanProcessor; + + use super::*; + + #[derive(Debug, Clone)] + struct MockSpanProcessor { + spans: Arc>>, + } + + impl MockSpanProcessor { + fn new() -> Self { + Self { + spans: Default::default(), + } + } + } + + impl SpanProcessor for MockSpanProcessor { + fn on_start(&self, _span: &mut Span, _cx: &Context) {} + + fn on_end(&self, span: SpanData) { + self.spans.lock().unwrap().push(span); + } + + fn force_flush(&self) -> TraceResult<()> { + Ok(()) + } + + fn shutdown(&mut self) -> TraceResult<()> { + Ok(()) + } + } + + #[test] + fn test_on_end_updates_trace_flags() { + let mock_processor = MockSpanProcessor::new(); + let processor = DatadogSpanProcessor::new(mock_processor.clone()); + let span_context = SpanContext::new( + TraceId::from_u128(1), + SpanId::from_u64(1), + TraceFlags::default(), + false, + Default::default(), + ); + let span_data = SpanData { + span_context, + parent_span_id: SpanId::from_u64(1), + span_kind: SpanKind::Client, + name: Default::default(), + start_time: SystemTime::now(), + end_time: SystemTime::now(), + attributes: EvictedHashMap::new(32, 32), 
+ events: EvictedQueue::new(32), + links: EvictedQueue::new(32), + status: Default::default(), + resource: Default::default(), + instrumentation_lib: Default::default(), + }; + + processor.on_end(span_data.clone()); + + // Verify that the trace flags are updated to sampled + let updated_trace_flags = span_data.span_context.trace_flags().with_sampled(true); + let stored_spans = mock_processor.spans.lock().unwrap(); + assert_eq!(stored_spans.len(), 1); + assert_eq!( + stored_spans[0].span_context.trace_flags(), + updated_trace_flags + ); + } +} diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/exporter/model/v05.rs b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/exporter/model/v05.rs index fd1590966e..e11bc9ed78 100644 --- a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/exporter/model/v05.rs +++ b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/exporter/model/v05.rs @@ -8,6 +8,7 @@ use super::unified_tags::UnifiedTags; use crate::plugins::telemetry::tracing::datadog_exporter::exporter::intern::StringInterner; use crate::plugins::telemetry::tracing::datadog_exporter::exporter::model::DD_MEASURED_KEY; use crate::plugins::telemetry::tracing::datadog_exporter::exporter::model::SAMPLING_PRIORITY_KEY; +use crate::plugins::telemetry::tracing::datadog_exporter::propagator::SamplingPriority; use crate::plugins::telemetry::tracing::datadog_exporter::DatadogTraceState; use crate::plugins::telemetry::tracing::datadog_exporter::Error; use crate::plugins::telemetry::tracing::datadog_exporter::ModelConfig; @@ -129,10 +130,22 @@ fn write_unified_tag<'a>( } fn get_sampling_priority(span: &SpanData) -> f64 { - if span.span_context.trace_state().priority_sampling_enabled() { - 1.0 - } else { - 0.0 + match span + .span_context + .trace_state() + .sampling_priority() + .unwrap_or_else(|| { + // Datadog sampling has not been set, revert to traceflags + if span.span_context.trace_flags().is_sampled() { + 
SamplingPriority::AutoKeep + } else { + SamplingPriority::AutoReject + } + }) { + SamplingPriority::UserReject => -1.0, + SamplingPriority::AutoReject => 0.0, + SamplingPriority::AutoKeep => 1.0, + SamplingPriority::UserKeep => 2.0, } } diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs index 1c586d48c8..74907ee6a4 100644 --- a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs +++ b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs @@ -158,6 +158,8 @@ pub use propagator::DatadogTraceState; pub use propagator::DatadogTraceStateBuilder; pub(crate) mod propagator { + use std::fmt::Display; + use once_cell::sync::Lazy; use opentelemetry::propagation::text_map_propagator::FieldIter; use opentelemetry::propagation::Extractor; @@ -177,9 +179,9 @@ pub(crate) mod propagator { const TRACE_FLAG_DEFERRED: TraceFlags = TraceFlags::new(0x02); const TRACE_STATE_PRIORITY_SAMPLING: &str = "psr"; - pub(crate) const TRACE_STATE_MEASURE: &str = "m"; - pub(crate) const TRACE_STATE_TRUE_VALUE: &str = "1"; - pub(crate) const TRACE_STATE_FALSE_VALUE: &str = "0"; + const TRACE_STATE_MEASURE: &str = "m"; + const TRACE_STATE_TRUE_VALUE: &str = "1"; + const TRACE_STATE_FALSE_VALUE: &str = "0"; static DATADOG_HEADER_FIELDS: Lazy<[String; 3]> = Lazy::new(|| { [ @@ -191,8 +193,8 @@ pub(crate) mod propagator { #[derive(Default)] pub struct DatadogTraceStateBuilder { - priority_sampling: bool, - measuring: bool, + sampling_priority: SamplingPriority, + measuring: Option, } fn boolean_to_trace_state_flag(value: bool) -> &'static str { @@ -209,33 +211,39 @@ pub(crate) mod propagator { #[allow(clippy::needless_update)] impl DatadogTraceStateBuilder { - pub fn with_priority_sampling(self, enabled: bool) -> Self { + pub fn with_priority_sampling(self, sampling_priority: SamplingPriority) -> Self { Self { - priority_sampling: enabled, + sampling_priority, ..self } } pub 
fn with_measuring(self, enabled: bool) -> Self { Self { - measuring: enabled, + measuring: Some(enabled), ..self } } pub fn build(self) -> TraceState { - let values = [ - ( - TRACE_STATE_MEASURE, - boolean_to_trace_state_flag(self.measuring), - ), - ( + if let Some(measuring) = self.measuring { + let values = [ + (TRACE_STATE_MEASURE, boolean_to_trace_state_flag(measuring)), + ( + TRACE_STATE_PRIORITY_SAMPLING, + &self.sampling_priority.to_string(), + ), + ]; + + TraceState::from_key_value(values).unwrap_or_default() + } else { + let values = [( TRACE_STATE_PRIORITY_SAMPLING, - boolean_to_trace_state_flag(self.priority_sampling), - ), - ]; + &self.sampling_priority.to_string(), + )]; - TraceState::from_key_value(values).unwrap_or_default() + TraceState::from_key_value(values).unwrap_or_default() + } } } @@ -244,9 +252,9 @@ pub(crate) mod propagator { fn measuring_enabled(&self) -> bool; - fn with_priority_sampling(&self, enabled: bool) -> TraceState; + fn with_priority_sampling(&self, sampling_priority: SamplingPriority) -> TraceState; - fn priority_sampling_enabled(&self) -> bool; + fn sampling_priority(&self) -> Option; } impl DatadogTraceState for TraceState { @@ -261,30 +269,77 @@ pub(crate) mod propagator { .unwrap_or_default() } - fn with_priority_sampling(&self, enabled: bool) -> TraceState { - self.insert( - TRACE_STATE_PRIORITY_SAMPLING, - boolean_to_trace_state_flag(enabled), - ) - .unwrap_or_else(|_err| self.clone()) + fn with_priority_sampling(&self, sampling_priority: SamplingPriority) -> TraceState { + self.insert(TRACE_STATE_PRIORITY_SAMPLING, sampling_priority.to_string()) + .unwrap_or_else(|_err| self.clone()) } - fn priority_sampling_enabled(&self) -> bool { - self.get(TRACE_STATE_PRIORITY_SAMPLING) - .map(trace_flag_to_boolean) - .unwrap_or_default() + fn sampling_priority(&self) -> Option { + self.get(TRACE_STATE_PRIORITY_SAMPLING).map(|value| { + SamplingPriority::try_from(value).unwrap_or(SamplingPriority::AutoReject) + }) } } - enum 
SamplingPriority { + #[derive(Default, Debug, Eq, PartialEq)] + pub(crate) enum SamplingPriority { UserReject = -1, + #[default] AutoReject = 0, AutoKeep = 1, UserKeep = 2, } + impl SamplingPriority { + pub(crate) fn as_i64(&self) -> i64 { + match self { + SamplingPriority::UserReject => -1, + SamplingPriority::AutoReject => 0, + SamplingPriority::AutoKeep => 1, + SamplingPriority::UserKeep => 2, + } + } + } + + impl Display for SamplingPriority { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let value = match self { + SamplingPriority::UserReject => -1, + SamplingPriority::AutoReject => 0, + SamplingPriority::AutoKeep => 1, + SamplingPriority::UserKeep => 2, + }; + write!(f, "{}", value) + } + } + + impl SamplingPriority { + pub fn as_str(&self) -> &'static str { + match self { + SamplingPriority::UserReject => "-1", + SamplingPriority::AutoReject => "0", + SamplingPriority::AutoKeep => "1", + SamplingPriority::UserKeep => "2", + } + } + } + + impl TryFrom<&str> for SamplingPriority { + type Error = ExtractError; + + fn try_from(value: &str) -> Result { + match value { + "-1" => Ok(SamplingPriority::UserReject), + "0" => Ok(SamplingPriority::AutoReject), + "1" => Ok(SamplingPriority::AutoKeep), + "2" => Ok(SamplingPriority::UserKeep), + _ => Err(ExtractError::SamplingPriority), + } + } + } + #[derive(Debug)] - enum ExtractError { + pub(crate) enum ExtractError { TraceId, SpanId, SamplingPriority, @@ -311,16 +366,7 @@ pub(crate) mod propagator { } fn create_trace_state_and_flags(trace_flags: TraceFlags) -> (TraceState, TraceFlags) { - if trace_flags & TRACE_FLAG_DEFERRED == TRACE_FLAG_DEFERRED { - (TraceState::default(), trace_flags) - } else { - ( - DatadogTraceStateBuilder::default() - .with_priority_sampling(trace_flags.is_sampled()) - .build(), - TraceFlags::SAMPLED, - ) - } + (TraceState::default(), trace_flags) } impl DatadogPropagator { @@ -343,23 +389,6 @@ pub(crate) mod propagator { .map_err(|_| ExtractError::SpanId) } - fn 
extract_sampling_priority( - &self, - sampling_priority: &str, - ) -> Result { - let i = sampling_priority - .parse::() - .map_err(|_| ExtractError::SamplingPriority)?; - - match i { - -1 => Ok(SamplingPriority::UserReject), - 0 => Ok(SamplingPriority::AutoReject), - 1 => Ok(SamplingPriority::AutoKeep), - 2 => Ok(SamplingPriority::UserKeep), - _ => Err(ExtractError::SamplingPriority), - } - } - fn extract_span_context( &self, extractor: &dyn Extractor, @@ -371,11 +400,11 @@ pub(crate) mod propagator { let span_id = self .extract_span_id(extractor.get(DATADOG_PARENT_ID_HEADER).unwrap_or("")) .unwrap_or(SpanId::INVALID); - let sampling_priority = self.extract_sampling_priority( - extractor - .get(DATADOG_SAMPLING_PRIORITY_HEADER) - .unwrap_or(""), - ); + let sampling_priority = extractor + .get(DATADOG_SAMPLING_PRIORITY_HEADER) + .unwrap_or("") + .try_into(); + let sampled = match sampling_priority { Ok(SamplingPriority::UserReject) | Ok(SamplingPriority::AutoReject) => { TraceFlags::default() @@ -387,7 +416,10 @@ pub(crate) mod propagator { Err(_) => TRACE_FLAG_DEFERRED, }; - let (trace_state, trace_flags) = create_trace_state_and_flags(sampled); + let (mut trace_state, trace_flags) = create_trace_state_and_flags(sampled); + if let Ok(sampling_priority) = sampling_priority { + trace_state = trace_state.with_priority_sampling(sampling_priority); + } Ok(SpanContext::new( trace_id, @@ -399,14 +431,6 @@ pub(crate) mod propagator { } } - fn get_sampling_priority(span_context: &SpanContext) -> SamplingPriority { - if span_context.trace_state().priority_sampling_enabled() { - SamplingPriority::AutoKeep - } else { - SamplingPriority::AutoReject - } - } - impl TextMapPropagator for DatadogPropagator { fn inject_context(&self, cx: &Context, injector: &mut dyn Injector) { let span = cx.span(); @@ -422,8 +446,11 @@ pub(crate) mod propagator { ); if span_context.trace_flags() & TRACE_FLAG_DEFERRED != TRACE_FLAG_DEFERRED { - let sampling_priority = 
get_sampling_priority(span_context); - + // The sampling priority + let sampling_priority = span_context + .trace_state() + .sampling_priority() + .unwrap_or_default(); injector.set( DATADOG_SAMPLING_PRIORITY_HEADER, (sampling_priority as i32).to_string(), @@ -460,8 +487,10 @@ pub(crate) mod propagator { (vec![(DATADOG_TRACE_ID_HEADER, "garbage")], SpanContext::empty_context()), (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "garbage")], SpanContext::new(TraceId::from_u128(1234), SpanId::INVALID, TRACE_FLAG_DEFERRED, true, TraceState::default())), (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TRACE_FLAG_DEFERRED, true, TraceState::default())), - (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "0")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(false).build())), - (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(true).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "-1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::default(), true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::UserReject).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "0")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::default(), true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::AutoReject).build())), + 
(vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::AutoKeep).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "2")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::UserKeep).build())), ] } @@ -473,8 +502,10 @@ pub(crate) mod propagator { (vec![], SpanContext::new(TraceId::from_hex("1234").unwrap(), SpanId::INVALID, TRACE_FLAG_DEFERRED, true, TraceState::default())), (vec![], SpanContext::new(TraceId::from_hex("1234").unwrap(), SpanId::INVALID, TraceFlags::SAMPLED, true, TraceState::default())), (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TRACE_FLAG_DEFERRED, true, TraceState::default())), - (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "0")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(false).build())), - (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(true).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "-1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::default(), true, 
DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::UserReject).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "0")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::default(), true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::AutoReject).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "1")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::AutoKeep).build())), + (vec![(DATADOG_TRACE_ID_HEADER, "1234"), (DATADOG_PARENT_ID_HEADER, "12"), (DATADOG_SAMPLING_PRIORITY_HEADER, "2")], SpanContext::new(TraceId::from_u128(1234), SpanId::from_u64(12), TraceFlags::SAMPLED, true, DatadogTraceStateBuilder::default().with_priority_sampling(SamplingPriority::UserKeep).build())), ] } diff --git a/apollo-router/src/plugins/telemetry/tracing/mod.rs b/apollo-router/src/plugins/telemetry/tracing/mod.rs index 0172f3e094..d2dc62b138 100644 --- a/apollo-router/src/plugins/telemetry/tracing/mod.rs +++ b/apollo-router/src/plugins/telemetry/tracing/mod.rs @@ -18,6 +18,7 @@ use tower::BoxError; use super::config_new::spans::Spans; use super::formatters::APOLLO_PRIVATE_PREFIX; use crate::plugins::telemetry::config::TracingCommon; +use crate::plugins::telemetry::tracing::datadog::DatadogSpanProcessor; pub(crate) mod apollo; pub(crate) mod apollo_telemetry; @@ -91,6 +92,7 @@ where Self: Sized + SpanProcessor, { fn filtered(self) -> ApolloFilterSpanProcessor; + fn always_sampled(self) -> DatadogSpanProcessor; } impl SpanProcessorExt for T @@ -100,6 +102,12 @@ where fn filtered(self) -> ApolloFilterSpanProcessor { ApolloFilterSpanProcessor { delegate: self } } + + /// This span processor will always send spans to the exporter even if they are 
not sampled. This is useful for the datadog agent which + /// uses spans for metrics. + fn always_sampled(self) -> DatadogSpanProcessor { + DatadogSpanProcessor::new(self) + } } /// Batch processor configuration diff --git a/apollo-router/src/plugins/telemetry/tracing/otlp.rs b/apollo-router/src/plugins/telemetry/tracing/otlp.rs index be294427f2..9a61075e5f 100644 --- a/apollo-router/src/plugins/telemetry/tracing/otlp.rs +++ b/apollo-router/src/plugins/telemetry/tracing/otlp.rs @@ -20,20 +20,23 @@ impl TracingConfigurator for super::super::otlp::Config { fn apply( &self, builder: Builder, - _common: &TracingCommon, + common: &TracingCommon, _spans_config: &Spans, ) -> Result { - tracing::info!("Configuring Otlp tracing: {}", self.batch_processor); let exporter: SpanExporterBuilder = self.exporter(TelemetryDataKind::Traces)?; - - Ok(builder.with_span_processor( - BatchSpanProcessor::builder( - exporter.build_span_exporter()?, - opentelemetry::runtime::Tokio, - ) - .with_batch_config(self.batch_processor.clone().into()) - .build() - .filtered(), - )) + let batch_span_processor = BatchSpanProcessor::builder( + exporter.build_span_exporter()?, + opentelemetry::runtime::Tokio, + ) + .with_batch_config(self.batch_processor.clone().into()) + .build() + .filtered(); + Ok( + if common.preview_datadog_agent_sampling.unwrap_or_default() { + builder.with_span_processor(batch_span_processor.always_sampled()) + } else { + builder.with_span_processor(batch_span_processor) + }, + ) } } diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index 826a377e04..3c222ba3d6 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -4,6 +4,7 @@ use std::net::SocketAddr; use std::net::TcpListener; use std::path::PathBuf; use std::process::Stdio; +use std::str::FromStr; use std::sync::Arc; use std::sync::Mutex; use std::time::Duration; @@ -18,6 +19,7 @@ use fred::types::Scanner; use futures::StreamExt; use http::header::ACCEPT; use 
http::header::CONTENT_TYPE; +use http::HeaderName; use http::HeaderValue; use mediatype::names::BOUNDARY; use mediatype::names::FORM_DATA; @@ -33,6 +35,7 @@ use opentelemetry::sdk::trace::TracerProvider; use opentelemetry::sdk::Resource; use opentelemetry::testing::trace::NoopSpanExporter; use opentelemetry::trace::TraceContextExt; +use opentelemetry_api::trace::SpanContext; use opentelemetry_api::trace::TraceId; use opentelemetry_api::trace::TracerProvider as OtherTracerProvider; use opentelemetry_api::Context; @@ -126,7 +129,7 @@ impl Respond for TracedResponder { pub enum Telemetry { Jaeger, Otlp { - endpoint: String, + endpoint: Option, }, Datadog, Zipkin, @@ -156,7 +159,9 @@ impl Telemetry { .build(), ) .build(), - Telemetry::Otlp { endpoint } => TracerProvider::builder() + Telemetry::Otlp { + endpoint: Some(endpoint), + } => TracerProvider::builder() .with_config(config) .with_span_processor( BatchSpanProcessor::builder( @@ -201,7 +206,7 @@ impl Telemetry { .build(), ) .build(), - Telemetry::None => TracerProvider::builder() + Telemetry::None | Telemetry::Otlp { endpoint: None } => TracerProvider::builder() .with_config(config) .with_simple_exporter(NoopSpanExporter::default()) .build(), @@ -258,7 +263,29 @@ impl Telemetry { } Telemetry::Datadog => { let propagator = opentelemetry_datadog::DatadogPropagator::new(); - propagator.extract(&headers) + let mut context = propagator.extract(&headers); + // We're going to override the sampled so that we can test sampling priority + if let Some(psr) = headers.get("x-datadog-sampling-priority") { + let state = context + .span() + .span_context() + .trace_state() + .insert("psr", psr.to_string()) + .expect("psr"); + context = context.with_remote_span_context(SpanContext::new( + context.span().span_context().trace_id(), + context.span().span_context().span_id(), + context + .span() + .span_context() + .trace_flags() + .with_sampled(true), + true, + state, + )); + } + + context } Telemetry::Otlp { .. 
} => { let propagator = opentelemetry::sdk::propagation::TraceContextPropagator::default(); @@ -568,7 +595,7 @@ impl IntegrationTest { async move { let client = reqwest::Client::new(); - let mut builder = client + let builder = client .post(url) .header( CONTENT_TYPE, @@ -579,14 +606,19 @@ impl IntegrationTest { .header("x-my-header", "test") .header("head", "test"); + let mut request = builder.json(&query).build().unwrap(); + telemetry.inject_context(&mut request); + if let Some(headers) = headers { for (name, value) in headers { - builder = builder.header(name, value); + request.headers_mut().remove(&name); + request.headers_mut().append( + HeaderName::from_str(&name).expect("header was invalid"), + value.try_into().expect("header was invalid"), + ); } } - let mut request = builder.json(&query).build().unwrap(); - telemetry.inject_context(&mut request); request.headers_mut().remove(ACCEPT); match client.execute(request).await { Ok(response) => (span_id, response), @@ -605,6 +637,7 @@ impl IntegrationTest { pub fn execute_untraced_query( &self, query: &Value, + headers: Option>, ) -> impl std::future::Future { assert!( self.router.is_some(), @@ -626,6 +659,16 @@ impl IntegrationTest { .unwrap(); request.headers_mut().remove(ACCEPT); + if let Some(headers) = headers { + for (name, value) in headers { + request.headers_mut().remove(&name); + request.headers_mut().append( + HeaderName::from_str(&name).expect("header was invalid"), + value.try_into().expect("header was invalid"), + ); + } + } + match client.execute(request).await { Ok(response) => ( TraceId::from_hex( diff --git a/apollo-router/tests/integration/mod.rs b/apollo-router/tests/integration/mod.rs index c383b5348f..06b77f688f 100644 --- a/apollo-router/tests/integration/mod.rs +++ b/apollo-router/tests/integration/mod.rs @@ -39,3 +39,12 @@ impl ValueExt for Value { self.as_str().map(|s| s.to_string()) } } + +impl ValueExt for &Value { + fn select_path<'a>(&'a self, path: &str) -> Result, BoxError> { + 
Ok(Selector::new().str_path(path)?.value(self).select()?) + } + fn as_string(&self) -> Option { + self.as_str().map(|s| s.to_string()) + } +} diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index 6aed76ff6d..39757ee389 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -2,17 +2,18 @@ extern crate core; use std::collections::HashMap; use std::collections::HashSet; -use std::sync::atomic::AtomicBool; +use std::sync::Arc; +use std::sync::Mutex; use std::time::Duration; use anyhow::anyhow; +use opentelemetry_api::trace::SpanContext; use opentelemetry_api::trace::TraceContextExt; use opentelemetry_api::trace::TraceId; +use opentelemetry_api::Context; use serde_json::json; use serde_json::Value; use tower::BoxError; -use tracing::Span; -use tracing_opentelemetry::OpenTelemetrySpanExt; use wiremock::ResponseTemplate; use crate::integration::common::graph_os_enabled; @@ -28,6 +29,9 @@ struct TraceSpec { span_names: HashSet<&'static str>, measured_spans: HashSet<&'static str>, unmeasured_spans: HashSet<&'static str>, + priority_sampled: Option<&'static str>, + // Not the metrics but the otel attribute + no_priority_sampled_attribute: Option, } #[tokio::test(flavor = "multi_thread")] @@ -35,8 +39,8 @@ async fn test_no_sample() -> Result<(), BoxError> { if !graph_os_enabled() { return Ok(()); } - let subgraph_was_sampled = std::sync::Arc::new(AtomicBool::new(false)); - let subgraph_was_sampled_callback = subgraph_was_sampled.clone(); + let context = std::sync::Arc::new(std::sync::Mutex::new(None)); + let context_clone = context.clone(); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!("fixtures/datadog_no_sample.router.yaml")) @@ -44,8 +48,10 @@ async fn test_no_sample() -> Result<(), BoxError> { json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), 
)) .subgraph_callback(Box::new(move || { - let sampled = Span::current().context().span().span_context().is_sampled(); - subgraph_was_sampled_callback.store(sampled, std::sync::atomic::Ordering::SeqCst); + let context = Context::current(); + let span = context.span(); + let span_context = span.span_context(); + *context_clone.lock().expect("poisoned") = Some(span_context.clone()); })) .build() .await; @@ -54,14 +60,318 @@ async fn test_no_sample() -> Result<(), BoxError> { router.assert_started().await; let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_id, result) = router.execute_untraced_query(&query).await; + let (_id, result) = router.execute_untraced_query(&query, None).await; router.graceful_shutdown().await; assert!(result.status().is_success()); - assert!(!subgraph_was_sampled.load(std::sync::atomic::Ordering::SeqCst)); + let context = context + .lock() + .expect("poisoned") + .as_ref() + .expect("state") + .clone(); + assert!(context.is_sampled()); + assert_eq!(context.trace_state().get("psr"), Some("0")); Ok(()) } +// We want to check we're able to override the behavior of preview_datadog_agent_sampling configuration even if we set a datadog exporter +#[tokio::test(flavor = "multi_thread")] +async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let context = std::sync::Arc::new(std::sync::Mutex::new(None)); + let context_clone = context.clone(); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_agent_sampling_disabled.router.yaml" + )) + .responder(ResponseTemplate::new(200).set_body_json( + json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), + )) + .subgraph_callback(Box::new(move || { + let context = Context::current(); + let span = context.span(); + let span_context = span.span_context(); + *context_clone.lock().expect("poisoned") = 
Some(span_context.clone()); + })) + .build() + .await; + + router.start().await; + router.assert_started().await; + + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let (id, result) = router.execute_untraced_query(&query, None).await; + router.graceful_shutdown().await; + assert!(result.status().is_success()); + let _context = context + .lock() + .expect("poisoned") + .as_ref() + .expect("state") + .clone(); + + tokio::time::sleep(Duration::from_secs(2)).await; + TraceSpec::builder() + .services([].into()) + .build() + .validate_trace(id) + .await?; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_propagated() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let context = std::sync::Arc::new(std::sync::Mutex::new(None)); + let context_clone = context.clone(); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!("fixtures/datadog.router.yaml")) + .responder(ResponseTemplate::new(200).set_body_json( + json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), + )) + .subgraph_callback(Box::new(move || { + let context = Context::current(); + let span = context.span(); + let span_context = span.span_context(); + *context_clone.lock().expect("poisoned") = Some(span_context.clone()); + })) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // Parent based sampling. psr MUST be populated with the value that we pass in. 
+ test_psr( + &context, + &mut router, + Some("-1"), + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("-1") + .build(), + ) + .await?; + test_psr( + &context, + &mut router, + Some("0"), + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("0") + .build(), + ) + .await?; + test_psr( + &context, + &mut router, + Some("1"), + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build(), + ) + .await?; + test_psr( + &context, + &mut router, + Some("2"), + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("2") + .build(), + ) + .await?; + + // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. + test_psr( + &context, + &mut router, + None, + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build(), + ) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_propagated_otel_request() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let context = std::sync::Arc::new(std::sync::Mutex::new(None)); + let context_clone = context.clone(); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { endpoint: None }) + .config(include_str!("fixtures/datadog.router.yaml")) + .responder(ResponseTemplate::new(200).set_body_json( + json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), + )) + .subgraph_callback(Box::new(move || { + let context = Context::current(); + let span = context.span(); + let span_context = span.span_context(); + *context_clone.lock().expect("poisoned") = Some(span_context.clone()); + })) + .build() + .await; + + router.start().await; + router.assert_started().await; + let query = json!({"query":"query ExampleQuery 
{topProducts{name}}","variables":{}}); + let (id, result) = router.execute_query(&query).await; + assert_eq!( + result + .headers() + .get("apollo-custom-trace-id") + .unwrap() + .to_str() + .unwrap(), + id.to_datadog() + ); + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build() + .validate_trace(id) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let context = std::sync::Arc::new(std::sync::Mutex::new(None)); + let context_clone = context.clone(); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_no_parent_sampler.router.yaml" + )) + .responder(ResponseTemplate::new(200).set_body_json( + json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), + )) + .subgraph_callback(Box::new(move || { + let context = Context::current(); + let span = context.span(); + let span_context = span.span_context(); + *context_clone.lock().expect("poisoned") = Some(span_context.clone()); + })) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // The router will ignore the upstream PSR as parent based sampling is disabled. 
+ test_psr( + &context, + &mut router, + Some("-1"), + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build(), + ) + .await?; + test_psr( + &context, + &mut router, + Some("0"), + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build(), + ) + .await?; + test_psr( + &context, + &mut router, + Some("1"), + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build(), + ) + .await?; + test_psr( + &context, + &mut router, + Some("2"), + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build(), + ) + .await?; + + test_psr( + &context, + &mut router, + None, + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build(), + ) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +async fn test_psr( + context: &Arc>>, + router: &mut IntegrationTest, + psr: Option<&str>, + trace_spec: TraceSpec, +) -> Result<(), BoxError> { + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let headers = if let Some(psr) = psr { + vec![("x-datadog-sampling-priority".to_string(), psr.to_string())] + } else { + vec![] + }; + let (id, result) = router + .execute_query_with_headers(&query, headers.into_iter().collect()) + .await; + + assert!(result.status().is_success()); + let context = context + .lock() + .expect("poisoned") + .as_ref() + .expect("state") + .clone(); + + assert_eq!( + context.trace_state().get("psr"), + trace_spec.priority_sampled + ); + trace_spec.validate_trace(id).await?; + Ok(()) +} + #[tokio::test(flavor = "multi_thread")] async fn test_default_span_names() -> Result<(), BoxError> { if !graph_os_enabled() { @@ -506,7 +816,7 @@ impl TraceSpec { async fn validate_trace(&self, id: TraceId) -> Result<(), BoxError> { let datadog_id = id.to_datadog(); let url = 
format!("http://localhost:8126/test/traces?trace_ids={datadog_id}"); - for _ in 0..10 { + for _ in 0..20 { if self.find_valid_trace(&url).await.is_ok() { return Ok(()); } @@ -533,11 +843,12 @@ impl TraceSpec { tracing::debug!("{}", serde_json::to_string_pretty(&trace)?); self.verify_trace_participants(&trace)?; self.verify_spans_present(&trace)?; - self.validate_measured_spans(&trace)?; + self.verify_measured_spans(&trace)?; self.verify_operation_name(&trace)?; self.verify_priority_sampled(&trace)?; + self.verify_priority_sampled_attribute(&trace)?; self.verify_version(&trace)?; - self.validate_span_kinds(&trace)?; + self.verify_span_kinds(&trace)?; Ok(()) } @@ -556,7 +867,7 @@ impl TraceSpec { Ok(()) } - fn validate_measured_spans(&self, trace: &Value) -> Result<(), BoxError> { + fn verify_measured_spans(&self, trace: &Value) -> Result<(), BoxError> { for expected in &self.measured_spans { assert!( self.measured_span(trace, expected)?, @@ -591,11 +902,13 @@ impl TraceSpec { .unwrap_or_default()) } - fn validate_span_kinds(&self, trace: &Value) -> Result<(), BoxError> { + fn verify_span_kinds(&self, trace: &Value) -> Result<(), BoxError> { // Validate that the span.kind has been propagated. We can just do this for a selection of spans. 
- self.validate_span_kind(trace, "router", "server")?; - self.validate_span_kind(trace, "supergraph", "internal")?; - self.validate_span_kind(trace, "http_request", "client")?; + if self.services.contains("router") { + self.validate_span_kind(trace, "router", "server")?; + self.validate_span_kind(trace, "supergraph", "internal")?; + self.validate_span_kind(trace, "http_request", "client")?; + } Ok(()) } @@ -652,19 +965,24 @@ impl TraceSpec { trace.select_path(&format!("$..[?(@.name == '{}')].meta.['span.kind']", name))?; let binding = binding1.first().or(binding2.first()); - assert!( - binding.is_some(), - "span.kind missing or incorrect {}, {}", - name, - trace - ); - assert_eq!( - binding - .expect("expected binding") - .as_str() - .expect("expected string"), - kind - ); + if binding.is_none() { + return Err(BoxError::from(format!( + "span.kind missing or incorrect {}, {}", + name, trace + ))); + } + + let binding = binding + .expect("expected binding") + .as_str() + .expect("expected string"); + if binding != kind { + return Err(BoxError::from(format!( + "span.kind mismatch, expected {} got {}", + kind, binding + ))); + } + Ok(()) } @@ -685,17 +1003,35 @@ impl TraceSpec { } fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError> { - let binding = trace.select_path("$.._sampling_priority_v1")?; - let sampling_priority = binding.first(); - // having this priority set to 1.0 everytime is not a problem as we're doing pre sampling in the full telemetry stack - // So basically if the trace was not sampled it wouldn't get to this stage and so nothing would be sent - assert_eq!( - sampling_priority - .expect("sampling priority expected") - .as_f64() - .expect("sampling priority must be a number"), - 1.0 - ); + if let Some(psr) = self.priority_sampled { + let binding = + trace.select_path("$..[?(@.service=='router')].metrics._sampling_priority_v1")?; + if binding.is_empty() { + return Err(BoxError::from("missing sampling priority")); + } + for 
sampling_priority in binding { + assert_eq!( + sampling_priority + .as_f64() + .expect("psr not string") + .to_string(), + psr + ); + } + } + Ok(()) + } + + fn verify_priority_sampled_attribute(&self, trace: &Value) -> Result<(), BoxError> { + if self.no_priority_sampled_attribute.unwrap_or_default() { + let binding = + trace.select_path("$..[?(@.service=='router')].meta['sampling.priority']")?; + if binding.is_empty() { + return Ok(()); + } else { + return Err(BoxError::from("sampling priority attribute exists")); + } + } Ok(()) } } diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml index d6ecc66607..0f0f50dd78 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml @@ -13,6 +13,7 @@ telemetry: resource: env: local1 service.version: router_version_override + preview_datadog_agent_sampling: true datadog: enabled: true batch_processor: diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled.router.yaml new file mode 100644 index 0000000000..49b1528c94 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled.router.yaml @@ -0,0 +1,23 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + # NOT always_off to allow us to test a sampling probability of zero + sampler: 0.0 + preview_datadog_agent_sampling: false + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + fixed_span_names: false + enable_span_mapping: false + instrumentation: + spans: + mode: spec_compliant + diff --git 
a/apollo-router/tests/integration/telemetry/fixtures/datadog_default_span_names.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_default_span_names.router.yaml index 67c2c070e6..e874c00fab 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_default_span_names.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_default_span_names.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true batch_processor: diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml new file mode 100644 index 0000000000..2e9c634dd9 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml @@ -0,0 +1,28 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + format: datadog + propagation: + trace_context: true + jaeger: true + common: + service_name: router + parent_based_sampler: false + resource: + env: local1 + service.version: router_version_override + preview_datadog_agent_sampling: true + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_no_sample.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_sample.router.yaml index d89d104346..19af041c56 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_no_sample.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_sample.router.yaml @@ -11,6 +11,7 @@ telemetry: service_name: router # NOT always_off to allow us to test a sampling probability of zero sampler: 0.0 + 
preview_datadog_agent_sampling: true datadog: enabled: true batch_processor: diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names.router.yaml index 7d5e1ff2e1..bb793301d0 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true # Span mapping will always override the span name as far as the test agent is concerned diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names_late.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names_late.router.yaml index dda383a784..821662b5be 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names_late.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_override_span_names_late.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true # Span mapping will always override the span name as far as the test agent is concerned diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_default.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_default.router.yaml index 96160b1831..0603e72c9c 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_default.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_default.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true enable_span_mapping: 
true diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_override.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_override.router.yaml index a01c44fc61..5eba22068b 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_override.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_resource_mapping_override.router.yaml @@ -7,6 +7,7 @@ telemetry: format: datadog common: service_name: router + preview_datadog_agent_sampling: true datadog: enabled: true enable_span_mapping: true diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp.router.yaml index f4484786f4..aa56c66187 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/otlp.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp.router.yaml @@ -9,7 +9,7 @@ telemetry: otlp: enabled: true protocol: http - endpoint: /traces + endpoint: batch_processor: scheduled_delay: 10ms metrics: @@ -22,3 +22,15 @@ telemetry: batch_processor: scheduled_delay: 10ms + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_no_sample.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_no_sample.router.yaml new file mode 100644 index 0000000000..77529f500d --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_no_sample.router.yaml @@ -0,0 +1,42 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: 
router + preview_datadog_agent_sampling: true + sampler: 0.0 + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample.router.yaml new file mode 100644 index 0000000000..6b1f32f71f --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample.router.yaml @@ -0,0 +1,42 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + preview_datadog_agent_sampling: true + sampler: 1.0 + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample_no_sample.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample_no_sample.router.yaml new file mode 100644 index 0000000000..77529f500d --- /dev/null +++ 
b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_agent_sample_no_sample.router.yaml @@ -0,0 +1,42 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + preview_datadog_agent_sampling: true + sampler: 0.0 + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation.router.yaml new file mode 100644 index 0000000000..7352f3d620 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation.router.yaml @@ -0,0 +1,39 @@ +telemetry: + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + preview_datadog_agent_sampling: true + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git 
a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_agent.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_agent.router.yaml new file mode 100644 index 0000000000..08323073f3 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_agent.router.yaml @@ -0,0 +1,38 @@ +telemetry: + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml new file mode 100644 index 0000000000..7fd47f096b --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml @@ -0,0 +1,40 @@ +telemetry: + exporters: + tracing: + propagation: + datadog: true + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + parent_based_sampler: false + preview_datadog_agent_sampling: true + service_name: router + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: 
spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml new file mode 100644 index 0000000000..4e31e0d1d6 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml @@ -0,0 +1,40 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + propagation: + zipkin: true + trace_context: true + common: + service_name: router + preview_datadog_agent_sampling: true + sampler: 1.0 + otlp: + enabled: true + protocol: http + endpoint: + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: + enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + + + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + + subgraph: + attributes: + otel.name: + subgraph_operation_name: string \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_no_parent_sampler.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_no_parent_sampler.router.yaml new file mode 100644 index 0000000000..5fdf22e0d6 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_no_parent_sampler.router.yaml @@ -0,0 +1,25 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + parent_based_sampler: false + otlp: + enabled: true + protocol: http + endpoint: /traces + batch_processor: + scheduled_delay: 10ms + metrics: + common: + service_name: router + otlp: 
+ enabled: true + endpoint: /metrics + protocol: http + batch_processor: + scheduled_delay: 10ms + diff --git a/apollo-router/tests/integration/telemetry/jaeger.rs b/apollo-router/tests/integration/telemetry/jaeger.rs index fcf59e4ef5..c9e79bd22a 100644 --- a/apollo-router/tests/integration/telemetry/jaeger.rs +++ b/apollo-router/tests/integration/telemetry/jaeger.rs @@ -90,7 +90,7 @@ async fn test_local_root() -> Result<(), BoxError> { router.assert_started().await; let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_untraced_query(&query).await; + let (id, result) = router.execute_untraced_query(&query, None).await; assert!(!result .headers() .get("apollo-custom-trace-id") @@ -121,7 +121,7 @@ async fn test_local_root_no_sample() -> Result<(), BoxError> { router.assert_started().await; let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_, response) = router.execute_untraced_query(&query).await; + let (_, response) = router.execute_untraced_query(&query, None).await; assert!(response.headers().get("apollo-custom-trace-id").is_some()); router.graceful_shutdown().await; @@ -141,7 +141,7 @@ async fn test_local_root_50_percent_sample() -> Result<(), BoxError> { let query = json!({"query":"query ExampleQuery {topProducts{name}}\n","variables":{}, "operationName": "ExampleQuery"}); for _ in 0..100 { - let (id, result) = router.execute_untraced_query(&query).await; + let (id, result) = router.execute_untraced_query(&query, None).await; if result.headers().get("apollo-custom-trace-id").is_some() && validate_trace( @@ -177,7 +177,7 @@ async fn test_no_telemetry() -> Result<(), BoxError> { router.assert_started().await; let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_, response) = router.execute_untraced_query(&query).await; + let (_, response) = router.execute_untraced_query(&query, None).await; 
assert!(response.headers().get("apollo-custom-trace-id").is_none()); router.graceful_shutdown().await; diff --git a/apollo-router/tests/integration/telemetry/otlp.rs b/apollo-router/tests/integration/telemetry/otlp.rs index 7eae04f567..0ba9178cec 100644 --- a/apollo-router/tests/integration/telemetry/otlp.rs +++ b/apollo-router/tests/integration/telemetry/otlp.rs @@ -1,10 +1,10 @@ extern crate core; +use std::collections::HashMap; use std::collections::HashSet; use std::time::Duration; use anyhow::anyhow; -use itertools::Itertools; use opentelemetry_api::trace::TraceId; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceResponse; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceResponse; @@ -18,37 +18,22 @@ use wiremock::Mock; use wiremock::MockServer; use wiremock::ResponseTemplate; +use crate::integration::common::graph_os_enabled; use crate::integration::common::Telemetry; use crate::integration::IntegrationTest; use crate::integration::ValueExt; #[tokio::test(flavor = "multi_thread")] async fn test_basic() -> Result<(), BoxError> { - let mock_server = wiremock::MockServer::start().await; - Mock::given(method("POST")) - .and(path("/traces")) - .respond_with(ResponseTemplate::new(200).set_body_raw( - ExportTraceServiceResponse::default().encode_to_vec(), - "application/x-protobuf", - )) - .expect(1..) - .mount(&mock_server) - .await; - Mock::given(method("POST")) - .and(path("/metrics")) - .respond_with(ResponseTemplate::new(200).set_body_raw( - ExportMetricsServiceResponse::default().encode_to_vec(), - "application/x-protobuf", - )) - .expect(1..) 
- .mount(&mock_server) - .await; - + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp.router.yaml") .replace("", &mock_server.uri()); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { - endpoint: format!("{}/traces", mock_server.uri()), + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) .config(&config) .build() @@ -65,15 +50,31 @@ async fn test_basic() -> Result<(), BoxError> { .get("apollo-custom-trace-id") .unwrap() .is_empty()); - validate_telemetry( - &mock_server, - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - false, - ) - .await?; + Spec::builder() + .operation_name("ExampleQuery") + .services(["client", "router", "subgraph"].into()) + .span_names( + [ + "query_planning", + "client_request", + "ExampleQuery__products__0", + "fetch", + "execution", + "query ExampleQuery", + "subgraph server", + "parse_query", + "http_request", + ] + .into(), + ) + .build() + .validate_trace(id, &mock_server) + .await?; + Spec::builder() + .service("router") + .build() + .validate_metrics(&mock_server) + .await?; router.touch_config().await; router.assert_reloaded().await; } @@ -81,146 +82,745 @@ async fn test_basic() -> Result<(), BoxError> { Ok(()) } -async fn validate_telemetry( - mock_server: &MockServer, - _id: TraceId, - query: &Value, - operation_name: Option<&str>, - services: &[&'static str], - custom_span_instrumentation: bool, +#[tokio::test(flavor = "multi_thread")] +async fn test_otlp_request_with_datadog_propagator() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_propagation.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { 
+ endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .config(&config) + .build() + .await; + + router.start().await; + router.assert_started().await; + + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let (id, _) = router.execute_query(&query).await; + Spec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build() + .validate_trace(id, &mock_server) + .await?; + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_otlp_request_with_datadog_propagator_no_agent() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_propagation_no_agent.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .config(&config) + .build() + .await; + + router.start().await; + router.assert_started().await; + + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let (id, _) = router.execute_query(&query).await; + Spec::builder() + .services(["client", "router", "subgraph"].into()) + .build() + .validate_trace(id, &mock_server) + .await?; + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( ) -> Result<(), BoxError> { - for _ in 0..10 { - let trace_valid = find_valid_trace( - mock_server, - query, - operation_name, - services, - custom_span_instrumentation, - ) + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml") 
+ .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .config(&config) + .build() .await; - let metrics_valid = find_valid_metrics(mock_server, query, operation_name, services).await; + router.start().await; + router.assert_started().await; - if metrics_valid.is_ok() && trace_valid.is_ok() { - return Ok(()); - } + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let (id, _) = router.execute_query(&query).await; + + Spec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .build() + .validate_trace(id, &mock_server) + .await?; + // ---------------------- zipkin propagator with unsampled trace + // Testing for an unsampled trace, so it should be sent to the otlp exporter with sampling priority set 0 + // But it shouldn't send the trace to subgraph as the trace is originally not sampled, the main goal is to measure it at the DD agent level + let id = TraceId::from_hex("80f198ee56343ba864fe8b2a57d3eff7").unwrap(); + let headers: HashMap = [ + ( + "X-B3-TraceId".to_string(), + "80f198ee56343ba864fe8b2a57d3eff7".to_string(), + ), + ( + "X-B3-ParentSpanId".to_string(), + "05e3ac9a4f6e3b90".to_string(), + ), + ("X-B3-SpanId".to_string(), "e457b5a2e4d86bd1".to_string()), + ("X-B3-Sampled".to_string(), "0".to_string()), + ] + .into(); + + let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; + Spec::builder() + .services(["router"].into()) + .priority_sampled("0") + .build() + .validate_trace(id, &mock_server) + .await?; + // ---------------------- trace context propagation + // Testing for a trace containing the right tracestate with m and psr for DD and a sampled trace, so it should be sent to the otlp exporter with sampling priority set to 1 + // And it should also send the trace to subgraph as the trace is sampled + let id = 
TraceId::from_hex("0af7651916cd43dd8448eb211c80319c").unwrap(); + let headers: HashMap = [ + ( + "traceparent".to_string(), + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01".to_string(), + ), + ("tracestate".to_string(), "m=1,psr=1".to_string()), + ] + .into(); + + let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; + Spec::builder() + .services(["router", "subgraph"].into()) + .priority_sampled("1") + .build() + .validate_trace(id, &mock_server) + .await?; + // ---------------------- + // Testing for a trace containing the right tracestate with m and psr for DD and an unsampled trace, so it should be sent to the otlp exporter with sampling priority set to 0 + // But it shouldn't send the trace to subgraph as the trace is originally not sampled, the main goal is to measure it at the DD agent level + let id = TraceId::from_hex("0af7651916cd43dd8448eb211c80319d").unwrap(); + let headers: HashMap = [ + ( + "traceparent".to_string(), + "00-0af7651916cd43dd8448eb211c80319d-b7ad6b7169203331-00".to_string(), + ), + ("tracestate".to_string(), "m=1,psr=0".to_string()), + ] + .into(); + + let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; + Spec::builder() + .services(["router"].into()) + .priority_sampled("0") + .build() + .validate_trace(id, &mock_server) + .await?; + // ---------------------- + // Testing for a trace containing a tracestate m and psr with psr set to 1 for DD and an unsampled trace, so it should be sent to the otlp exporter with sampling priority set to 1 + // It should not send the trace to the subgraph as we didn't use the datadog propagator and therefore the trace will remain unsampled. 
+ let id = TraceId::from_hex("0af7651916cd43dd8448eb211c80319e").unwrap(); + let headers: HashMap = [ + ( + "traceparent".to_string(), + "00-0af7651916cd43dd8448eb211c80319e-b7ad6b7169203331-00".to_string(), + ), + ("tracestate".to_string(), "m=1,psr=1".to_string()), + ] + .into(); + + let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build() + .validate_trace(id, &mock_server) + .await?; + + // Be careful if you add the same kind of test crafting your own trace id, make sure to increment the previous trace id by 1 if not you'll receive all the previous spans tested with the same trace id before + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_untraced_request_no_sample_datadog_agent() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_agent_no_sample.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder().config(&config).build().await; + + router.start().await; + router.assert_started().await; + + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let (id, _) = router.execute_untraced_query(&query, None).await; + Spec::builder() + .services(["router"].into()) + .priority_sampled("0") + .build() + .validate_trace(id, &mock_server) + .await?; + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_untraced_request_sample_datadog_agent() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_agent_sample.router.yaml") + .replace("", &mock_server.uri()); + let mut 
router = IntegrationTest::builder().config(&config).build().await; + + router.start().await; + router.assert_started().await; + + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let (id, _) = router.execute_untraced_query(&query, None).await; + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build() + .validate_trace(id, &mock_server) + .await?; + router.graceful_shutdown().await; + Ok(()) +} - tokio::time::sleep(Duration::from_millis(100)).await; +#[tokio::test(flavor = "multi_thread")] +async fn test_untraced_request_sample_datadog_agent_unsampled() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); } - find_valid_trace( - mock_server, - query, - operation_name, - services, - custom_span_instrumentation, + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_agent_sample_no_sample.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .config(&config) + .build() + .await; + + router.start().await; + router.assert_started().await; + + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let (id, _) = router.execute_untraced_query(&query, None).await; + Spec::builder() + .services(["router"].into()) + .priority_sampled("0") + .build() + .validate_trace(id, &mock_server) + .await?; + router.graceful_shutdown().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_propagated() -> Result<(), BoxError> { + if !graph_os_enabled() { + panic!("Error: test skipped because GraphOS is not enabled"); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_propagation.router.yaml") + .replace("", &mock_server.uri()); + let mut router = 
IntegrationTest::builder() + // We're using datadog propagation as this is what we are trying to test. + .telemetry(Telemetry::Datadog) + .config(config) + .responder(ResponseTemplate::new(200).set_body_json( + json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // Parent based sampling. psr MUST be populated with the value that we pass in. + test_psr( + &mut router, + Some("-1"), + Spec::builder() + .services(["router"].into()) + .priority_sampled("-1") + .build(), + &mock_server, + ) + .await?; + test_psr( + &mut router, + Some("0"), + Spec::builder() + .services(["router"].into()) + .priority_sampled("0") + .build(), + &mock_server, + ) + .await?; + test_psr( + &mut router, + Some("1"), + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build(), + &mock_server, + ) + .await?; + test_psr( + &mut router, + Some("2"), + Spec::builder() + .services(["router"].into()) + .priority_sampled("2") + .build(), + &mock_server, + ) + .await?; + + // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. 
+ test_psr( + &mut router, + None, + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build(), + &mock_server, + ) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mock_server = mock_otlp_server().await; + let config = include_str!("fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml") + .replace("", &mock_server.uri()); + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(config) + .responder(ResponseTemplate::new(200).set_body_json( + json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // The router will ignore the upstream PSR as parent based sampling is disabled. + test_psr( + &mut router, + Some("-1"), + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build(), + &mock_server, + ) + .await?; + test_psr( + &mut router, + Some("0"), + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build(), + &mock_server, + ) + .await?; + test_psr( + &mut router, + Some("1"), + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build(), + &mock_server, + ) + .await?; + test_psr( + &mut router, + Some("2"), + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build(), + &mock_server, ) .await?; - find_valid_metrics(mock_server, query, operation_name, services).await?; + + test_psr( + &mut router, + None, + Spec::builder() + .services(["router"].into()) + .priority_sampled("1") + .build(), + &mock_server, + ) + .await?; + + router.graceful_shutdown().await; Ok(()) } -async fn find_valid_trace( +async fn test_psr( + router: &mut IntegrationTest, + psr: Option<&str>, + trace_spec: Spec, mock_server: 
&MockServer, - _query: &Value, - _operation_name: Option<&str>, - services: &[&'static str], - _custom_span_instrumentation: bool, ) -> Result<(), BoxError> { - let requests = mock_server - .received_requests() - .await - .expect("Could not get otlp requests"); - - // A valid trace has: - // * A valid service name - // * All three services - // * The correct spans - // * All spans are parented - // * Required attributes of 'router' span has been set - let traces: Vec<_>= requests - .iter() - .filter_map(|r| { - if r.url.path().ends_with("/traces") { + let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); + let headers = if let Some(psr) = psr { + vec![("x-datadog-sampling-priority".to_string(), psr.to_string())] + } else { + vec![] + }; + let (id, result) = router + .execute_query_with_headers(&query, headers.into_iter().collect()) + .await; + + assert!(result.status().is_success()); + trace_spec.validate_trace(id, mock_server).await?; + Ok(()) +} + +#[derive(buildstructor::Builder)] +struct Spec { + operation_name: Option, + version: Option, + services: HashSet<&'static str>, + span_names: HashSet<&'static str>, + measured_spans: HashSet<&'static str>, + unmeasured_spans: HashSet<&'static str>, + priority_sampled: Option<&'static str>, +} + +impl Spec { + #[allow(clippy::too_many_arguments)] + async fn validate_trace(&self, id: TraceId, mock_server: &MockServer) -> Result<(), BoxError> { + for _ in 0..10 { + if self.find_valid_trace(id, mock_server).await.is_ok() { + return Ok(()); + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + self.find_valid_trace(id, mock_server).await?; + Ok(()) + } + + async fn validate_metrics(&self, mock_server: &MockServer) -> Result<(), BoxError> { + for _ in 0..10 { + if self.find_valid_metrics(mock_server).await.is_ok() { + return Ok(()); + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + self.find_valid_metrics(mock_server).await?; + Ok(()) + } + + 
#[allow(clippy::too_many_arguments)] + async fn find_valid_trace( + &self, + trace_id: TraceId, + mock_server: &MockServer, + ) -> Result<(), BoxError> { + // A valid trace has: + // * All three services + // * The correct spans + // * All spans are parented + // * Required attributes of 'router' span has been set + + let requests = mock_server.received_requests().await; + let trace= Value::Array(requests.unwrap_or_default().iter().filter(|r| r.url.path().ends_with("/traces")) + .filter_map(|r|{ match opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest::decode( bytes::Bytes::copy_from_slice(&r.body), ) { Ok(trace) => { match serde_json::to_value(trace) { - Ok(trace) => { Some(Ok(trace)) } - Err(e) => { - Some(Err(BoxError::from(format!("failed to decode trace: {}", e)))) + Ok(trace) => { + Some(trace) } + Err(_) => { + None } } } - Err(e) => { - Some(Err(BoxError::from(format!("failed to decode trace: {}", e)))) + Err(_) => { + None } } + }).filter(|t| { + + let datadog_trace_id = TraceId::from_u128(trace_id.to_datadog() as u128); + let trace_found1 = !t.select_path(&format!("$..[?(@.traceId == '{}')]", trace_id)).unwrap_or_default().is_empty(); + let trace_found2 = !t.select_path(&format!("$..[?(@.traceId == '{}')]", datadog_trace_id)).unwrap_or_default().is_empty(); + trace_found1 | trace_found2 + }).collect()); + + self.verify_services(&trace)?; + self.verify_spans_present(&trace)?; + self.verify_measured_spans(&trace)?; + self.verify_operation_name(&trace)?; + self.verify_priority_sampled(&trace)?; + self.verify_version(&trace)?; + self.verify_span_kinds(&trace)?; + + Ok(()) + } + + fn verify_version(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_version) = &self.version { + let binding = trace.select_path("$..version")?; + let version = binding.first(); + assert_eq!( + version + .expect("version expected") + .as_str() + .expect("version must be a string"), + expected_version + ); + } + Ok(()) + } + + fn 
verify_measured_spans(&self, trace: &Value) -> Result<(), BoxError> { + for expected in &self.measured_spans { + assert!( + self.measured_span(trace, expected)?, + "missing measured span {}", + expected + ); + } + for unexpected in &self.unmeasured_spans { + assert!( + !self.measured_span(trace, unexpected)?, + "unexpected measured span {}", + unexpected + ); + } + Ok(()) + } + + fn measured_span(&self, trace: &Value, name: &str) -> Result { + let binding1 = trace.select_path(&format!( + "$..[?(@.meta.['otel.original_name'] == '{}')].metrics.['_dd.measured']", + name + ))?; + let binding2 = trace.select_path(&format!( + "$..[?(@.name == '{}')].metrics.['_dd.measured']", + name + ))?; + Ok(binding1 + .first() + .or(binding2.first()) + .and_then(|v| v.as_f64()) + .map(|v| v == 1.0) + .unwrap_or_default()) + } + + fn verify_span_kinds(&self, trace: &Value) -> Result<(), BoxError> { + // Validate that the span.kind has been propagated. We can just do this for a selection of spans. + self.validate_span_kind(trace, "router", "server")?; + self.validate_span_kind(trace, "supergraph", "internal")?; + self.validate_span_kind(trace, "http_request", "client")?; + Ok(()) + } + + fn verify_services(&self, trace: &Value) -> Result<(), BoxError> { + let actual_services: HashSet = trace + .select_path("$..resource.attributes..[?(@.key == 'service.name')].value.stringValue")? + .into_iter() + .filter_map(|service| service.as_string()) + .collect(); + tracing::debug!("found services {:?}", actual_services); + let expected_services = self + .services + .iter() + .map(|s| s.to_string()) + .collect::>(); + if actual_services != expected_services { + return Err(BoxError::from(format!( + "incomplete traces, got {actual_services:?} expected {expected_services:?}" + ))); + } + Ok(()) + } + + fn verify_spans_present(&self, trace: &Value) -> Result<(), BoxError> { + let operation_names: HashSet = trace + .select_path("$..spans..name")? 
+ .into_iter() + .filter_map(|span_name| span_name.as_string()) + .collect(); + let mut span_names: HashSet<&str> = self.span_names.clone(); + if self.services.contains("client") { + span_names.insert("client_request"); + } + tracing::debug!("found spans {:?}", operation_names); + let missing_operation_names: Vec<_> = span_names + .iter() + .filter(|o| !operation_names.contains(**o)) + .collect(); + if !missing_operation_names.is_empty() { + return Err(BoxError::from(format!( + "spans did not match, got {operation_names:?}, missing {missing_operation_names:?}" + ))); + } + Ok(()) + } + + fn validate_span_kind(&self, trace: &Value, name: &str, kind: &str) -> Result<(), BoxError> { + let kind = match kind { + "internal" => 1, + "client" => 3, + "server" => 2, + _ => panic!("unknown kind"), + }; + let binding1 = trace.select_path(&format!( + "$..spans..[?(@.kind == {})]..[?(@.key == 'otel.original_name')].value..[?(@ == '{}')]", + kind, name + ))?; + let binding2 = trace.select_path(&format!( + "$..spans..[?(@.kind == {} && @.name == '{}')]", + kind, name + ))?; + let binding = binding1.first().or(binding2.first()); + + if binding.is_none() { + return Err(BoxError::from(format!( + "span.kind missing or incorrect {}, {}", + name, kind + ))); + } + Ok(()) + } + + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_operation_name) = &self.operation_name { + let binding = + trace.select_path("$..[?(@.name == 'supergraph')]..[?(@.key == 'graphql.operation.name')].value.stringValue")?; + let operation_name = binding.first(); + assert_eq!( + operation_name + .expect("graphql.operation.name expected") + .as_str() + .expect("graphql.operation.name must be a string"), + expected_operation_name + ); + } + Ok(()) + } + + fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(psr) = self.priority_sampled { + let binding = trace.select_path( + "$..[?(@.name == 'execution')]..[?(@.key == 
'sampling.priority')].value.intValue", + )?; + if binding.is_empty() { + return Err(BoxError::from("missing sampling priority")); } - else { - None + for sampling_priority in binding { + assert_eq!( + sampling_priority + .as_i64() + .expect("psr not an integer") + .to_string(), + psr + ); } - }) - .try_collect()?; - if !traces.is_empty() { - let json_trace = serde_json::Value::Array(traces); - verify_trace_participants(&json_trace, services)?; - + } else { + assert!(trace.select_path("$..[?(@.name == 'execution')]..[?(@.key == 'sampling.priority')].value.intValue")?.is_empty()) + } Ok(()) - } else { - Err(anyhow!("No traces received").into()) } -} -fn verify_trace_participants(trace: &Value, services: &[&'static str]) -> Result<(), BoxError> { - let actual_services: HashSet = trace - .select_path("$..resource.attributes[?(@.key=='service.name')].value.stringValue")? - .into_iter() - .filter_map(|service| service.as_string()) - .collect(); - tracing::debug!("found services {:?}", actual_services); - - let expected_services = services - .iter() - .map(|s| s.to_string()) - .collect::>(); - if actual_services != expected_services { - return Err(BoxError::from(format!( - "incomplete traces, got {actual_services:?} expected {expected_services:?}" - ))); + async fn find_valid_metrics(&self, mock_server: &MockServer) -> Result<(), BoxError> { + let requests = mock_server + .received_requests() + .await + .expect("Could not get otlp requests"); + if let Some(metrics) = requests.iter().find(|r| r.url.path().ends_with("/metrics")) { + let metrics = opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest::decode(bytes::Bytes::copy_from_slice(&metrics.body))?; + let json_metrics = serde_json::to_value(metrics)?; + // For now just validate service name. 
+ self.verify_services(&json_metrics)?; + + Ok(()) + } else { + Err(anyhow!("No metrics received").into()) + } } - Ok(()) } -fn validate_service_name(trace: Value) -> Result<(), BoxError> { - let service_name = - trace.select_path("$..resource.attributes[?(@.key=='service.name')].value.stringValue")?; - assert_eq!( - service_name.first(), - Some(&&Value::String("router".to_string())) - ); - Ok(()) +async fn mock_otlp_server() -> MockServer { + let mock_server = wiremock::MockServer::start().await; + Mock::given(method("POST")) + .and(path("/v1/traces")) + .respond_with(ResponseTemplate::new(200).set_body_raw( + ExportTraceServiceResponse::default().encode_to_vec(), + "application/x-protobuf", + )) + .expect(1..) + .mount(&mock_server) + .await; + Mock::given(method("POST")) + .and(path("/metrics")) + .respond_with(ResponseTemplate::new(200).set_body_raw( + ExportMetricsServiceResponse::default().encode_to_vec(), + "application/x-protobuf", + )) + .expect(1..) + .mount(&mock_server) + .await; + mock_server } -async fn find_valid_metrics( - mock_server: &MockServer, - _query: &Value, - _operation_name: Option<&str>, - _services: &[&'static str], -) -> Result<(), BoxError> { - let requests = mock_server - .received_requests() - .await - .expect("Could not get otlp requests"); - if let Some(metrics) = requests.iter().find(|r| r.url.path().ends_with("/metrics")) { - let metrics = opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest::decode(bytes::Bytes::copy_from_slice(&metrics.body))?; - let json_trace = serde_json::to_value(metrics)?; - // For now just validate service name. 
- validate_service_name(json_trace)?; - - Ok(()) - } else { - Err(anyhow!("No metrics received").into()) +pub(crate) trait DatadogId { + fn to_datadog(&self) -> u64; +} +impl DatadogId for TraceId { + fn to_datadog(&self) -> u64 { + let bytes = &self.to_bytes()[std::mem::size_of::()..std::mem::size_of::()]; + u64::from_be_bytes(bytes.try_into().unwrap()) } } diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index 0eea7691d9..e91b12508e 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -12,11 +12,16 @@ For general tracing configuration, refer to [Router Tracing Configuration](./ove ## OTLP configuration -To export traces to Datadog via OTLP, you must do the following: -- Configure the Datadog agent to accept OTLP traces. -- Configure the router to send traces to the Datadog agent. +OTLP is the [OpenTelemetry protocol](https://opentelemetry.io/docs/specs/otel/protocol/), and is the recommended protocol for transmitting telemetry, including traces, to Datadog. -To configure the Datadog agent, add OTLP configuration to your `datadog.yaml`. For example: +To setup traces to Datadog via OTLP, you must do the following: + +- Modify the default configuration of the Datadog Agent to accept OTLP traces submitted to it by the router. +- Configure the router to send traces to the configured Datadog Agent. + +### Datadog Agent configuration + +To configure the Datadog Agent, add OTLP configuration to your `datadog.yaml`. For example: ```yaml title="datadog.yaml" otlp_config: @@ -26,26 +31,42 @@ otlp_config: endpoint: :4317 ``` -To configure the router, enable the [OTLP exporter](./otlp) and set `endpoint: `. 
For example: +For additional Datadog Agent configuration details, review Datadog's [Enabling OTLP Ingestion on the Datadog Agent](https://docs.datadoghq.com/opentelemetry/interoperability/otlp_ingest_in_the_agent/?tab=host#enabling-otlp-ingestion-on-the-datadog-agent) documentation. + +### Router configuration + +To configure the router, enable the [OTLP exporter](./otlp) and set `endpoint: `. For example: ```yaml title="router.yaml" telemetry: exporters: tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + sampler: 0.1 + otlp: enabled: true - # Optional endpoint, either 'default' or a URL (Defaults to http://127.0.0.1:4317) endpoint: "${env.DATADOG_AGENT_HOST}:4317" + # Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario. + batch_processor: + max_concurrent_exports: 100 ``` -For more details about Datadog configuration, see [Datadog Agent configuration](https://docs.datadoghq.com/opentelemetry/otlp_ingest_in_the_agent/?tab=host). +Adjusting the `sampler` will allow you to control the sampling decisions that the router will make on its own and decrease the rate at which you sample, which can have a direct impact on your your Datadog bill. + + + +Depending on the volume of spans being created in a router instance, it will be necessary to adjust the `batch_processor` settings in your `exporter` config. If this is necessary, you will see warning messages from the router regarding the batch span processor. This applies to both OTLP and the Datadog native exporter. 
+ + ### Enabling log correlation To enable Datadog log correlation, you must configure `dd.trace_id` to appear on the `router` span: - + ```yaml title="router.yaml" telemetry: instrumentation: @@ -72,10 +93,18 @@ The router can be configured to connect to either the native, default Datadog ag telemetry: exporters: tracing: - datadog: - enabled: true - # Optional endpoint, either 'default' or a URL (Defaults to http://127.0.0.1:8126) - endpoint: "http://${env.DATADOG_AGENT_HOST}:8126" + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + sampler: 0.1 + + datadog: + enabled: true + # Optional endpoint, either 'default' or a URL (Defaults to http://127.0.0.1:8126) + endpoint: "http://${env.DATADOG_AGENT_HOST}:8126" + + # Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario. + batch_processor: + max_concurrent_exports: 100 # Enable graphql.operation.name attribute on supergraph spans. instrumentation: @@ -86,6 +115,12 @@ telemetry: graphql.operation.name: true ``` + + +Depending on the volume of spans being created in a router instance, it will be necessary to adjust the `batch_processor` settings in your `exporter` config. This applies to both OTLP and the Datadog native exporter. + + + ### `enabled` Set to true to enable the Datadog exporter. Defaults to false. 
@@ -227,11 +262,11 @@ If you have introduced a new span in a custom build of the Router you can enable telemetry: exporters: tracing: - datadog: - batch_processor: + datadog: + batch_processor: max_export_batch_size: 512 max_concurrent_exports: 1 - max_export_timeout: 30s + max_export_timeout: 30s max_queue_size: 2048 scheduled_delay: 5s ``` From 42247b7b173829119cad724a9f069cf6d7053d72 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 4 Oct 2024 09:23:27 +0100 Subject: [PATCH 02/26] Update docs --- .../telemetry/exporters/tracing/datadog.mdx | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index e91b12508e..1ded523ec2 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -43,6 +43,7 @@ telemetry: tracing: common: # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + preview_datadog_agent_sampling: true sampler: 0.1 otlp: @@ -55,7 +56,7 @@ telemetry: max_concurrent_exports: 100 ``` -Adjusting the `sampler` will allow you to control the sampling decisions that the router will make on its own and decrease the rate at which you sample, which can have a direct impact on your your Datadog bill. +Adjusting the `sampler` will allow you to control the sampling decisions that the router will make on its own and decrease the rate at which you sample, which can have a direct impact on your Datadog bill. @@ -63,6 +64,28 @@ Depending on the volume of spans being created in a router instance, it will be +### Enabling Datadog Agent sampling + +The Datadog APM view relies on traces to generate metrics. For this to be accurate 100% of requests must be sampled and sent to the Datadog agent. 
+To prevent ALL traces from then being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + preview_datadog_agent_sampling: true + sampler: 0.1 +``` + + + + Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent, but only the `sampler` percentage of them will be forwarded to Datadog. This means that your APM view will be correct at the cost of + the router sending more spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent. + + + ### Enabling log correlation To enable Datadog log correlation, you must configure `dd.trace_id` to appear on the `router` span: @@ -95,6 +118,7 @@ telemetry: tracing: common: # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + preview_datadog_agent_sampling: true sampler: 0.1 datadog: @@ -286,3 +310,24 @@ telemetry: | `resource_mapping` | See [config](#resource_mapping) | A map of span names to attribute names. | | `span_metrics` | See [config](#span_metrics) | A map of span names to boolean. | +## `preview_datadog_agent_sampling` (default: `false`) + +The Datadog APM view relies on traces to generate metrics. For this to be accurate 100% of requests must be sampled and sent to the Datadog agent. +To prevent ALL traces from then being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. 
Experiment to find a value that is good for you! + preview_datadog_agent_sampling: true + sampler: 0.1 +``` + + + + Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent, but only the `sampler` percentage of them will be forwarded to Datadog. This means that your APM view will be correct at the cost of + the router sending more spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent. + + \ No newline at end of file From 7ad4d02e22586cf37c89fa2909e163b9046e52d0 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 4 Oct 2024 10:20:45 +0100 Subject: [PATCH 03/26] Update docs --- .../telemetry/exporters/tracing/datadog.mdx | 82 ++++++++++++++++++- 1 file changed, 80 insertions(+), 2 deletions(-) diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index 1ded523ec2..3c22af6b88 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -310,7 +310,83 @@ telemetry: | `resource_mapping` | See [config](#resource_mapping) | A map of span names to attribute names. | | `span_metrics` | See [config](#span_metrics) | A map of span names to boolean. | -## `preview_datadog_agent_sampling` (default: `false`) +## Sampler configuration + +When using Datadog you will need to take into consideration if you want to use the Datadog APM view or rely on OTLP metrics to gain insight into the Router's performance. +The Datadog APM vie is driven by traces, and for this to be accurate 100% of requests must be sampled and sent to the Datadog agent. + +Tracing is expensive both in terms of APM costs but also Router performance, and typically you will want to set the `sampler` to a low value ion production environments. +However, this will mean that the APM view will only show a small percentage of traces. 
+ +Datadog agent sampling is a mode where ALL traces are sent to the Datadog agent, but only a percentage of them are forwarded to Datadog. This makes the APM view accurate while keeping costs low +at the cost of the Router having an effective sample rate of 100% under the hood. + +Here are some guides on how to configure the `sampler` and `preview_datadog_agent_sampling` to get the desired behavior: + +**I want the APM view to show metrics for 100% of traffic, and I am OK with the performance impact on the Router.** + +Set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + preview_datadog_agent_sampling: true + sampler: 0.1 +``` + +**I want the Datadog agent to be in control of the percentage of traces sent to Datadog.** + +Use the Datadog agent `probabilistic_sampling` option sampler and set the `sampler` to `1` to allow the Datadog agent to control the sampling rate. + +Router config: +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + sampler: always_on +``` + +Datadog agent config: +```yaml +otlp_config: + traces: + probabilistic_sampling: + sampling_percentage: 10 +``` + +**I want the most performance from the Router and am not concerned with the APM view. I use metrics and traces to monitor my application.** + +Set the `sampler` to a low value to reduce the number of traces sent to Datadog. Leave `preview_datadog_agent_sampling` to `false`. 
+
+```yaml title="router.yaml"
+telemetry:
+  exporters:
+    tracing:
+      common:
+        sampler: 0.1
+        preview_datadog_agent_sampling: false
+```
+
+### `sampler` (default: `always_on`)
+
+The `sampler` configuration allows you to control the sampling decisions that the router will make on its own and decrease the rate at which you sample, which can have a direct impact on your Datadog bill.
+
+```yaml title="router.yaml"
+telemetry:
+  exporters:
+    tracing:
+      common:
+        # Only 10 percent of spans will be forwarded to the Datadog agent. Experiment to find a value that is good for you!
+        sampler: 0.1
+```
+
+If you are using the Datadog APM view then you should set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog.
+
+### `preview_datadog_agent_sampling` (default: `false`)
 
 The Datadog APM view relies on traces to generate metrics. For this to be accurate 100% of requests must be sampled and sent to the Datadog agent.
 To prevent ALL traces from then being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog.
@@ -328,6 +404,8 @@ telemetry:
 
     Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent, but only the `sampler` percentage of them will be forwarded to Datadog. This means that your APM view will be correct at the cost of
-    the router sending more spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent.
+    the Router sending more spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent.
+
+    If you are OK with your APM view only showing a subset of traces, then you can leave `preview_datadog_agent_sampling` to `false`, however it is recommended to rely on OTLP metrics to gain insight into the Router's performance.
\ No newline at end of file From d8bf3528c00d56819b9ad4fadbdba5686564f9c2 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 4 Oct 2024 10:47:47 +0100 Subject: [PATCH 04/26] Update docs --- .../telemetry/exporters/tracing/datadog.mdx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index 3c22af6b88..4de2c86759 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -313,12 +313,12 @@ telemetry: ## Sampler configuration When using Datadog you will need to take into consideration if you want to use the Datadog APM view or rely on OTLP metrics to gain insight into the Router's performance. -The Datadog APM vie is driven by traces, and for this to be accurate 100% of requests must be sampled and sent to the Datadog agent. +The Datadog APM view is driven by traces, and for this to be accurate 100% of requests must be sampled and sent to the Datadog agent. Tracing is expensive both in terms of APM costs but also Router performance, and typically you will want to set the `sampler` to a low value ion production environments. However, this will mean that the APM view will only show a small percentage of traces. -Datadog agent sampling is a mode where ALL traces are sent to the Datadog agent, but only a percentage of them are forwarded to Datadog. This makes the APM view accurate while keeping costs low +Datadog Agent sampling is a mode where ALL traces are sent to the Datadog agent, but only a percentage of them are forwarded to Datadog. This makes the APM view accurate while keeping costs low at the cost of the Router having an effective sample rate of 100% under the hood. 
Here are some guides on how to configure the `sampler` and `preview_datadog_agent_sampling` to get the desired behavior: @@ -332,7 +332,8 @@ telemetry: exporters: tracing: common: - # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + # All requests will be traced and sent to the Datadog agent. + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. preview_datadog_agent_sampling: true sampler: 0.1 ``` @@ -347,6 +348,7 @@ telemetry: exporters: tracing: common: + # All requests will be traced and sent to the Datadog agent. sampler: always_on ``` @@ -355,6 +357,7 @@ Datadog agent config: otlp_config: traces: probabilistic_sampling: + # Only 10 percent of spans will be forwarded to Datadog sampling_percentage: 10 ``` @@ -367,6 +370,7 @@ telemetry: exporters: tracing: common: + # Only 10 percent of requests will be traced and sent to the Datadog agent. The APM view will only show a subset of total request data but the Router will perform better. sampler: 0.1 preview_datadog_agent_sampling: false ``` From 110917cac72c32571031e1914e1d589700bf51b8 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 4 Oct 2024 10:55:31 +0100 Subject: [PATCH 05/26] Update docs --- .../configuration/telemetry/exporters/tracing/datadog.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index 4de2c86759..c98d570142 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -321,7 +321,7 @@ However, this will mean that the APM view will only show a small percentage of t Datadog Agent sampling is a mode where ALL traces are sent to the Datadog agent, but only a percentage of them are forwarded to Datadog. 
This makes the APM view accurate while keeping costs low at the cost of the Router having an effective sample rate of 100% under the hood. -Here are some guides on how to configure the `sampler` and `preview_datadog_agent_sampling` to get the desired behavior: +Use the following guidelines on how to configure the `sampler` and `preview_datadog_agent_sampling` to get the desired behavior: **I want the APM view to show metrics for 100% of traffic, and I am OK with the performance impact on the Router.** @@ -340,7 +340,7 @@ telemetry: **I want the Datadog agent to be in control of the percentage of traces sent to Datadog.** -Use the Datadog agent `probabalistic_sampling` option sampler and set the `sampler` to `1` to allow the Datadog agent to control the sampling rate. +Use the Datadog agent `probabalistic_sampling` option sampler and set the `sampler` to `always_on` to allow the Datadog agent to control the sampling rate. Router config: ```yaml title="router.yaml" From 562b7ac0456a011a348d32d0cd9fe4114b30d1c8 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 4 Oct 2024 11:12:22 +0100 Subject: [PATCH 06/26] Split the changelogs --- ....md => fix_bryn_datadog_agent_sampling.md} | 19 ++++--------------- ..._upstream_sampling_decision_propagation.md | 7 +++++++ 2 files changed, 11 insertions(+), 15 deletions(-) rename .changesets/{fix_bryn_datadog_upstream_sampling_decision_test.md => fix_bryn_datadog_agent_sampling.md} (57%) create mode 100644 .changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md diff --git a/.changesets/fix_bryn_datadog_upstream_sampling_decision_test.md b/.changesets/fix_bryn_datadog_agent_sampling.md similarity index 57% rename from .changesets/fix_bryn_datadog_upstream_sampling_decision_test.md rename to .changesets/fix_bryn_datadog_agent_sampling.md index bb6447a66b..1bea4a5277 100644 --- a/.changesets/fix_bryn_datadog_upstream_sampling_decision_test.md +++ b/.changesets/fix_bryn_datadog_agent_sampling.md @@ -1,11 +1,6 @@ -### 
Respect x-datadog-sampling-priority ([PR #6017](https://github.com/apollographql/router/pull/6017)) +### Add `preview_datadog_agent_sampling` ([PR #6017](https://github.com/apollographql/router/pull/6017)) -This PR consists of two fixes: -#### Datadog priority sampling resolution is not lost. - -Previously a `x-datadog-sampling-priority` of `-1` would be converted to `0` for downstream requests and `2` would be converted to `1`. - -#### The sampler option in the `telemetry.exporters.tracing.common.sampler` is not datadog aware. +The sampler option in the `telemetry.exporters.tracing.common.sampler` is not datadog aware. To get accurate APM metrics all spans must be sent to the datadog agent with a `psr` or `sampling.priority` attribute set appropriately to record the sampling decision. @@ -24,25 +19,19 @@ telemetry: # Example OTLP exporter configuration otlp: enabled: true - # Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario. - batch_processor: - max_concurrent_exports: 100 # Example Datadog native exporter configuration datadog: enabled: true - # Optional batch processor setting, this will enable the batch processor to send concurrent requests in a high load scenario. - batch_processor: - max_concurrent_exports: 100 ``` By using these options, you can decrease your Datadog bill as you will only be sending a percentage of spans from the Datadog agent to datadog. > [!IMPORTANT] -> Users must enable `preview_datadog_agent_sampling` to get accurate APM metrics. +> Users must enable `preview_datadog_agent_sampling` to get accurate APM metrics. Users that have been using recent versions of the router will have to modify their configuration to retain full APM metrics. > [!IMPORTANT] > Sending all spans to the datadog agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and the Datadog native exporter. 
-By [@BrynCooke](https://github.com/BrynCooke) in https://github.com/apollographql/router/pull/6017 +See the updated Datadog tracing documentation for more information on configuration options and their implications. \ No newline at end of file diff --git a/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md b/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md new file mode 100644 index 0000000000..c67d135689 --- /dev/null +++ b/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md @@ -0,0 +1,7 @@ +### Datadog priority sampling resolution is lost ([PR #6017](https://github.com/apollographql/router/pull/6017)) + +Previously a `x-datadog-sampling-priority` of `-1` would be converted to `0` for downstream requests and `2` would be converted to `1`. +This means that when propagating to downstream services a value of USER_REJECT would be transmitted as AUTO_REJECT. + +This PR fixes this by ensuring that the `x-datadog-sampling-priority` is transmitted as is to downstream services. + From 1776e5983a93e523cbd9dce31dafaeefe209fb02 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 4 Oct 2024 11:13:01 +0100 Subject: [PATCH 07/26] Remove commented out code for auto enabling sampling --- apollo-router/src/plugins/telemetry/mod.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index fa6fa6494f..dabde7206a 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -286,20 +286,6 @@ impl Plugin for Telemetry { .expect("otel error handler lock poisoned, fatal"); let mut config = init.config; - // This code would have enabled datadog agent sampling by default, but for now we will leave it as opt-in. - // If the datadog exporter is enabled then enable the agent sampler. - // If users are using otlp export then they will need to set this explicitly in their config. 
- // - // if config.exporters.tracing.datadog.enabled() - // && config - // .exporters - // .tracing - // .common - // .preview_datadog_agent_sampling - // .is_none() - // { - // config.exporters.tracing.common.preview_datadog_agent_sampling = Some(true); - // } config.instrumentation.spans.update_defaults(); config.instrumentation.instruments.update_defaults(); config.exporters.logging.validate()?; From 9b60c38d45a827a0e09f9d3163b9d0e84b734735 Mon Sep 17 00:00:00 2001 From: bryn Date: Wed, 16 Oct 2024 10:07:05 +0100 Subject: [PATCH 08/26] Update docs to indicate that preview datadog agent sampling will NOT use rate based sampling from agent. --- .../fix_bryn_datadog_agent_sampling.md | 13 ++--- .../telemetry/exporters/tracing/datadog.mdx | 11 +++-- .../telemetry/exporters/tracing/overview.mdx | 49 ++++++++++++++----- 3 files changed, 49 insertions(+), 24 deletions(-) diff --git a/.changesets/fix_bryn_datadog_agent_sampling.md b/.changesets/fix_bryn_datadog_agent_sampling.md index 1bea4a5277..71d6306017 100644 --- a/.changesets/fix_bryn_datadog_agent_sampling.md +++ b/.changesets/fix_bryn_datadog_agent_sampling.md @@ -15,14 +15,7 @@ telemetry: sampler: 0.1 # Send all spans to the Datadog agent. preview_datadog_agent_sampling: true - - # Example OTLP exporter configuration - otlp: - enabled: true - - # Example Datadog native exporter configuration - datadog: - enabled: true + ``` @@ -31,6 +24,10 @@ By using these options, you can decrease your Datadog bill as you will only be s > [!IMPORTANT] > Users must enable `preview_datadog_agent_sampling` to get accurate APM metrics. Users that have been using recent versions of the router will have to modify their configuration to retain full APM metrics. +> [!IMPORTANT] +> The Router does not support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). 
+> Configuring `traces_per_second` in the Datadog agent will NOT dynamically adjust the Router's sampling rate to meet the target rate. + > [!IMPORTANT] > Sending all spans to the datadog agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and the Datadog native exporter. diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index c98d570142..1e1d5d0013 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -75,15 +75,16 @@ telemetry: tracing: common: # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! - preview_datadog_agent_sampling: true sampler: 0.1 + preview_datadog_agent_sampling: true ``` - - Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent, but only the `sampler` percentage of them will be forwarded to Datadog. This means that your APM view will be correct at the cost of - the router sending more spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent. - + The Router does not support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). + Configuring `traces_per_second` in the Datadog agent will not dynamically adjust the Router's sampling rate to meet the target rate. + + + Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent. 
### Enabling log correlation diff --git a/docs/source/configuration/telemetry/exporters/tracing/overview.mdx b/docs/source/configuration/telemetry/exporters/tracing/overview.mdx index c7b81cca70..d9663a8ec6 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/overview.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/overview.mdx @@ -111,6 +111,29 @@ telemetry: - `parent_based_sampler` enables clients to make the sampling decision. This guarantees that a trace that starts at a client will also have spans at the router. You may wish to disable it (setting `parent_based_sampler: false`) if your router is exposed directly to the internet. +### `preview_datadog_agent_sampling` + +The Datadog APM view relies on traces to generate metrics. For this to be accurate 100% of requests must be sampled and sent to the Datadog agent. +To prevent ALL traces from then being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + +```yaml title="router.yaml" +telemetry: + exporters: + tracing: + common: + # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + sampler: 0.1 + preview_datadog_agent_sampling: true +``` + + + The Router does not support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). + Configuring `traces_per_second` in the Datadog agent will not dynamically adjust the Router's sampling rate to meet the target rate. + + + Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent. 
+ + ### `propagation` The `telemetry.exporters.tracing.propagation` section allows you to configure which propagators are active in addition to those automatically activated by using an exporter. @@ -235,17 +258,21 @@ Using this configuration you will have a response header called `my-trace-id` co ## Tracing common reference -| Attribute | Default | Description | -|----------------------------------|--------------------------|-------------------------------------------------| -| `service_name` | `unknown_service:router` | The OpenTelemetry service name. | -| `service_namespace` | | The OpenTelemetry namespace. | -| `resource` | | The OpenTelemetry resource to attach to traces. | -| `experimental_response_trace_id` | | Return the trace ID in a response header. | -| `max_attributes_per_event` | 128 | The maximum number of attributes per event. | -| `max_attributes_per_link` | 128 | The maximum number of attributes per link. | -| `max_attributes_per_span` | 128 | The maximum number of attributes per span. | -| `max_events_per_span` | 128 | The maximum number of events per span. | -| `max_links_per_span` | 128 | The maximum links per span. | +| Attribute | Default | Description | +|----------------------------------|--------------------------|--------------------------------------------------| +| `parent_based_sampler` | `true` | Sampling decisions from upstream will be honored | +| `preview_datadog_agent_sampling` | `false` | Send all spans to the Datadog agent. | +| `propagation` | | The propagation configuration. | +| `sampler` | `always_on` | The sampling rate for traces. | +| `service_name` | `unknown_service:router` | The OpenTelemetry service name. | +| `service_namespace` | | The OpenTelemetry namespace. | +| `resource` | | The OpenTelemetry resource to attach to traces. | +| `experimental_response_trace_id` | | Return the trace ID in a response header. | +| `max_attributes_per_event` | 128 | The maximum number of attributes per event. 
| +| `max_attributes_per_link` | 128 | The maximum number of attributes per link. | +| `max_attributes_per_span` | 128 | The maximum number of attributes per span. | +| `max_events_per_span` | 128 | The maximum number of events per span. | +| `max_links_per_span` | 128 | The maximum links per span. | ## Related topics From 16cd222a91a8f14e56f73991192e9f0d1726c986 Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Fri, 25 Oct 2024 09:48:34 +0100 Subject: [PATCH 09/26] Update docs/source/configuration/telemetry/exporters/tracing/datadog.mdx Co-authored-by: Edward Huang --- .../configuration/telemetry/exporters/tracing/datadog.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index 1e1d5d0013..fcd32dc4a0 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -362,7 +362,7 @@ otlp_config: sampling_percentage: 10 ``` -**I want the most performance from the Router and am not concerned with the APM view. I use metrics and traces to monitor my application.** +**I want the best performance from the router and I'm not concerned with the APM view. I use metrics and traces to monitor my application.** Set the `sample` to a low value to reduce the number of traces sent to Datadog. Leave `preview_datadog_agent_sampling` to `false`. 
From d7532cedcdccc0057f948910a54aa9b9e323b1ed Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Fri, 25 Oct 2024 09:48:47 +0100 Subject: [PATCH 10/26] Update docs/source/configuration/telemetry/exporters/tracing/datadog.mdx Co-authored-by: Edward Huang --- .../configuration/telemetry/exporters/tracing/datadog.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index fcd32dc4a0..f81989500c 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -12,7 +12,7 @@ For general tracing configuration, refer to [Router Tracing Configuration](./ove ## OTLP configuration -OTLP is the [OpenTelemetry protocol](https://opentelemetry.io/docs/specs/otel/protocol/), and is the recommended protocol for transmitting telemetry, including traces, to Datadog. +[OpenTelemetry protocol (OTLP)](https://opentelemetry.io/docs/specs/otel/protocol/) is the recommended protocol for transmitting telemetry, including traces, to Datadog. 
To setup traces to Datadog via OTLP, you must do the following: From c91a2338af39d537521839375004151247e85027 Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Fri, 25 Oct 2024 09:49:22 +0100 Subject: [PATCH 11/26] Apply suggestions from code review Co-authored-by: Edward Huang --- .changesets/fix_bryn_datadog_agent_sampling.md | 18 +++++++++--------- ...g_upstream_sampling_decision_propagation.md | 7 +++---- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.changesets/fix_bryn_datadog_agent_sampling.md b/.changesets/fix_bryn_datadog_agent_sampling.md index 71d6306017..50c2d6997d 100644 --- a/.changesets/fix_bryn_datadog_agent_sampling.md +++ b/.changesets/fix_bryn_datadog_agent_sampling.md @@ -1,10 +1,10 @@ -### Add `preview_datadog_agent_sampling` ([PR #6017](https://github.com/apollographql/router/pull/6017)) +### Enable accurate Datadog APM metrics ([PR #6017](https://github.com/apollographql/router/pull/6017)) -The sampler option in the `telemetry.exporters.tracing.common.sampler` is not datadog aware. +The router supports a new preview feature, the `preview_datadog_agent_sampling` option, to enable sending all spans to the Datadog Agent so APM metrics and views are accurate. -To get accurate APM metrics all spans must be sent to the datadog agent with a `psr` or `sampling.priority` attribute set appropriately to record the sampling decision. +Previously, the sampler option in `telemetry.exporters.tracing.common.sampler` wasn't Datadog-aware. To get accurate Datadog APM metrics, all spans must be sent to the Datadog Agent with a `psr` or `sampling.priority` attribute set appropriately to record the sampling decision. -`preview_datadog_agent_sampling` option in the router.yaml enables this behavior and should be used when exporting to the datadog agent via OTLP or datadog native. +The `preview_datadog_agent_sampling` option enables accurate Datadog APM metrics. It should be used when exporting to the Datadog Agent, via OTLP or Datadog-native. 
 ```yaml
 telemetry:
   exporters:
     tracing:
       common:
         # Send 10 percent of spans to Datadog
         sampler: 0.1
         # Send all spans to the Datadog agent.
         preview_datadog_agent_sampling: true
       
 
 ```
 
-By using these options, you can decrease your Datadog bill as you will only be sending a percentage of spans from the Datadog agent to datadog.
+Using these options can decrease your Datadog bill, because you will be sending only a percentage of spans from the Datadog Agent to Datadog.
 
 > [!IMPORTANT]
 > Users must enable `preview_datadog_agent_sampling` to get accurate APM metrics. Users that have been using recent versions of the router will have to modify their configuration to retain full APM metrics.
 
 > [!IMPORTANT]
-> The Router does not support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent).
-> Configuring `traces_per_second` in the Datadog agent will NOT dynamically adjust the Router's sampling rate to meet the target rate.
+> The router doesn't support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent).
+> Configuring `traces_per_second` in the Datadog Agent won't dynamically adjust the router's sampling rate to meet the target rate.
 
 > [!IMPORTANT]
-> Sending all spans to the datadog agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and the Datadog native exporter.
+> Sending all spans to the Datadog Agent may require that you tweak the `batch_processor` settings in your exporter config. This applies to both OTLP and Datadog native exporters.
 
-See the updated Datadog tracing documentation for more information on configuration options and their implications.
\ No newline at end of file
+Learn more about configuration options and their implications in the [updated Datadog tracing documentation](https://apollographql.com/docs/router/configuration/telemetry/exporters/tracing/datadog).
\ No newline at end of file diff --git a/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md b/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md index c67d135689..d05f173528 100644 --- a/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md +++ b/.changesets/fix_bryn_datadog_upstream_sampling_decision_propagation.md @@ -1,7 +1,6 @@ -### Datadog priority sampling resolution is lost ([PR #6017](https://github.com/apollographql/router/pull/6017)) +### Fix transmitted header value for Datadog priority sampling resolution ([PR #6017](https://github.com/apollographql/router/pull/6017)) -Previously a `x-datadog-sampling-priority` of `-1` would be converted to `0` for downstream requests and `2` would be converted to `1`. -This means that when propagating to downstream services a value of USER_REJECT would be transmitted as AUTO_REJECT. +The router now transmits correct values of `x-datadog-sampling-priority` to downstream services. -This PR fixes this by ensuring that the `x-datadog-sampling-priority` is transmitted as is to downstream services. +Previously, an `x-datadog-sampling-priority` of `-1` was incorrectly converted to `0` for downstream requests, and `2` was incorrectly converted to `1`. When propagating to downstream services, this resulted in values of `USER_REJECT` being incorrectly transmitted as `AUTO_REJECT`. 
From 98624c70afa5b2bbe5e7c2cd8f9def02127d9fff Mon Sep 17 00:00:00 2001 From: Bryn Cooke Date: Fri, 25 Oct 2024 09:50:48 +0100 Subject: [PATCH 12/26] Apply suggestions from code review Co-authored-by: Edward Huang --- .../telemetry/exporters/tracing/datadog.mdx | 58 ++++++++++--------- .../telemetry/exporters/tracing/overview.mdx | 22 ++++--- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx index f81989500c..27c41b3579 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/datadog.mdx @@ -16,7 +16,7 @@ For general tracing configuration, refer to [Router Tracing Configuration](./ove To setup traces to Datadog via OTLP, you must do the following: -- Modify the default configuration of the Datadog Agent to accept OTLP traces submitted to it by the router. +- Modify the default configuration of the Datadog Agent to accept OTLP traces from the router. - Configure the router to send traces to the configured Datadog Agent. ### Datadog Agent configuration @@ -35,14 +35,14 @@ For additional Datadog Agent configuration details, review Datadog's [Enabling O ### Router configuration -To configure the router, enable the [OTLP exporter](./otlp) and set `endpoint: `. For example: +To configure the router, enable the [OTLP exporter](/router/configuration/telemetry/exporters/tracing/otlp) and set `endpoint: `. For example: ```yaml title="router.yaml" telemetry: exporters: tracing: common: - # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + # Configured to forward 10 percent of spans from the Datadog Agent to Datadog. Experiment to find a value that is good for you. 
preview_datadog_agent_sampling: true sampler: 0.1 @@ -56,35 +56,37 @@ telemetry: max_concurrent_exports: 100 ``` -Adjusting the `sampler` will allow you to control the sampling decisions that the router will make on its own and decrease the rate at which you sample, which can have a direct impact on your Datadog bill. +Adjusting the `sampler` controls the sampling decisions that the router makes on its own and decreases the rate at which you sample. Your sample rate can have a direct impact on your Datadog bill. -Depending on the volume of spans being created in a router instance, it will be necessary to adjust the `batch_processor` settings in your `exporter` config. If this is necessary, you will see warning messages from the router regarding the batch span processor. This applies to both OTLP and the Datadog native exporter. +If you see warning messages from the router regarding the batch span processor, you may need to adjust your `batch_processor` settings in your `exporter` config to match the volume of spans being created in a router instance. This applies to both OTLP and the Datadog native exporters. ### Enabling Datadog Agent sampling -The Datadog APM view relies on traces to generate metrics. For this to be accurate 100% of requests must be sampled and sent to the Datadog agent. -To prevent ALL traces from then being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. +The Datadog APM view relies on traces to generate metrics. For these metrics to be accurate, all requests must be sampled and sent to the Datadog agent. +To prevent all traces from being sent to Datadog, in your router you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. 
```yaml title="router.yaml" telemetry: exporters: tracing: common: - # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + # Configured to forward 10 percent of spans from the Datadog Agent to Datadog. Experiment to find a value that is good for you. sampler: 0.1 preview_datadog_agent_sampling: true ``` - The Router does not support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). - Configuring `traces_per_second` in the Datadog agent will not dynamically adjust the Router's sampling rate to meet the target rate. - - - Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent. + + - The router doesn't support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). + + - Configuring `traces_per_second` in the Datadog Agent will not dynamically adjust the router's sampling rate to meet the target rate. + + - Using `preview_datadog_agent_sampling` will send _all_ spans to the Datadog Agent. This will have an impact on the resource usage and performance of both the router and Datadog Agent. + ### Enabling log correlation @@ -118,7 +120,7 @@ telemetry: exporters: tracing: common: - # Only 10 percent of spans will be forwarded from the Datadog agent to Datadog. Experiment to find a value that is good for you! + # Configured to forward 10 percent of spans from the Datadog Agent to Datadog. Experiment to find a value that is good for you. preview_datadog_agent_sampling: true sampler: 0.1 @@ -313,18 +315,17 @@ telemetry: ## Sampler configuration -When using Datadog you will need to take into consideration if you want to use the Datadog APM view or rely on OTLP metrics to gain insight into the Router's performance. 
-The Datadog APM view is driven by traces, and for this to be accurate 100% of requests must be sampled and sent to the Datadog agent. +When using Datadog to gain insight into your router's performance, you need to decide whether to use the Datadog APM view or rely on OTLP metrics. +The Datadog APM view is driven by traces. In order for this view to be accurate, all requests must be sampled and sent to the Datadog Agent. -Tracing is expensive both in terms of APM costs but also Router performance, and typically you will want to set the `sampler` to a low value ion production environments. -However, this will mean that the APM view will only show a small percentage of traces. +Tracing is expensive both in terms of APM costs and router performance, so you typically will want to set the `sampler` to sample at low rates in production environments. +This, however, impacts the APM view, which will show only a small percentage of traces. -Datadog Agent sampling is a mode where ALL traces are sent to the Datadog agent, but only a percentage of them are forwarded to Datadog. This makes the APM view accurate while keeping costs low -at the cost of the Router having an effective sample rate of 100% under the hood. +To mitigate this, you can use Datadog Agent sampling mode, where _all_ traces are sent to the Datadog Agent but only a percentage of them are forwarded to Datadog. This keeps the APM view accurate while lowering costs. Note that the router will incur a performance cost of having an effective sample rate of 100%. 
Use the following guidelines on how to configure the `sampler` and `preview_datadog_agent_sampling` to get the desired behavior: -**I want the APM view to show metrics for 100% of traffic, and I am OK with the performance impact on the Router.** +**I want the APM view to show metrics for 100% of traffic, and I am OK with the performance impact on the router.** Set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. @@ -339,9 +340,9 @@ telemetry: sampler: 0.1 ``` -**I want the Datadog agent to be in control of the percentage of traces sent to Datadog.** +**I want the Datadog Agent to be in control of the percentage of traces sent to Datadog.** -Use the Datadog agent `probabalistic_sampling` option sampler and set the `sampler` to `always_on` to allow the Datadog agent to control the sampling rate. +Use the Datadog Agent's `probabalistic_sampling` option sampler and set the `sampler` to `always_on` to allow the agent to control the sampling rate. Router config: ```yaml title="router.yaml" @@ -406,11 +407,14 @@ telemetry: sampler: 0.1 ``` - +Using `preview_datadog_agent_sampling` will send _all_ spans to the Datadog Agent, but only the percentage of traces configured by the `sampler` will be forwarded to Datadog. This means that your APM view will be accurate, but it will incur performance and resource usage costs for both the router and Datadog Agent to send and receive all spans. - Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent, but only the `sampler` percentage of them will be forwarded to Datadog. This means that your APM view will be correct at the cost of - the Router sending more spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent. +If your use case allows your APM view to show only a subset of traces, then you can set `preview_datadog_agent_sampling` to `false`. 
You should alternatively rely on OTLP metrics to gain insight into the router's performance. - If you are OK with your APM view only showing a subset of traces, then you can leave `preview_datadog_agent_sampling` to `false`, however it is recommended to rely on OTLP metrics to gain insight into the Router's performance. + + +- The router doesn't support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). + +- Configuring `traces_per_second` in the Datadog Agent will not dynamically adjust the router's sampling rate to meet the target rate. \ No newline at end of file diff --git a/docs/source/configuration/telemetry/exporters/tracing/overview.mdx b/docs/source/configuration/telemetry/exporters/tracing/overview.mdx index d9663a8ec6..5a10bad2bc 100644 --- a/docs/source/configuration/telemetry/exporters/tracing/overview.mdx +++ b/docs/source/configuration/telemetry/exporters/tracing/overview.mdx @@ -111,10 +111,20 @@ telemetry: - `parent_based_sampler` enables clients to make the sampling decision. This guarantees that a trace that starts at a client will also have spans at the router. You may wish to disable it (setting `parent_based_sampler: false`) if your router is exposed directly to the internet. + + ### `preview_datadog_agent_sampling` -The Datadog APM view relies on traces to generate metrics. For this to be accurate 100% of requests must be sampled and sent to the Datadog agent. -To prevent ALL traces from then being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. + + + + + +Enable accurate Datadog APM views with the `preview_datadog_agent_sampling` option. + +The Datadog APM view relies on traces to generate metrics. For this to be accurate, all requests must be sampled and sent to the Datadog Agent. 
+ +To both enable accurate APM views and prevent _all_ traces from being sent to Datadog, you must set `preview_datadog_agent_sampling` to `true` and adjust the `sampler` to the desired percentage of traces to be sent to Datadog. ```yaml title="router.yaml" telemetry: @@ -126,13 +136,7 @@ telemetry: preview_datadog_agent_sampling: true ``` - - The Router does not support [`in-agent` ingestion control](https://docs.datadoghq.com/tracing/trace_pipeline/ingestion_mechanisms/?tab=java#in-the-agent). - Configuring `traces_per_second` in the Datadog agent will not dynamically adjust the Router's sampling rate to meet the target rate. - - - Using `preview_datadog_agent_sampling` will send ALL spans to the Datadog agent. This will have an impact on the resource usage and performance of both the Router and Datadog agent. - +To learn more details and limitations about this option, go to [`preview_datadog_agent_sampling`](/router/configuration/telemetry/exporters/tracing/datadog#preview_datadog_agent_sampling) in DataDog trace exporter docs. ### `propagation` From 30ffb131f670d5c7147a7f9ddc25598946f1ed7c Mon Sep 17 00:00:00 2001 From: bryn Date: Tue, 10 Dec 2024 11:20:17 +0000 Subject: [PATCH 13/26] Improve datadog and otlp sampling tests We made some weird configuration previously to force particular behaviour which when looked at subsequently didn't make any sense. In particular the otlp tests should have had otlp propagation with the datadog propagator also enabled. Also added a chack for the subgraph service to ensure it is correctly sampled. 
--- apollo-router/tests/common.rs | 20 +++- .../tests/integration/telemetry/datadog.rs | 113 ++++++------------ .../tests/integration/telemetry/otlp.rs | 84 +++++++++++-- 3 files changed, 121 insertions(+), 96 deletions(-) diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index 72010eeef9..2fa1d8a024 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -80,6 +80,7 @@ pub struct IntegrationTest { collect_stdio: Option<(tokio::sync::oneshot::Sender, regex::Regex)>, _subgraphs: wiremock::MockServer, telemetry: Telemetry, + extra_propagator: Telemetry, pub _tracer_provider_client: TracerProvider, pub _tracer_provider_subgraph: TracerProvider, @@ -105,11 +106,15 @@ struct TracedResponder { telemetry: Telemetry, subscriber_subgraph: Dispatch, subgraph_callback: Option>, + subgraph_context: Arc>>, } impl Respond for TracedResponder { fn respond(&self, request: &wiremock::Request) -> ResponseTemplate { let context = self.telemetry.extract_context(request); + *self.subgraph_context.lock().expect("lock poisoned") = + Some(context.span().span_context().clone()); + tracing_core::dispatcher::with_default(&self.subscriber_subgraph, || { let _context_guard = context.attach(); let span = info_span!("subgraph server"); @@ -285,11 +290,7 @@ impl Telemetry { context = context.with_remote_span_context(SpanContext::new( context.span().span_context().trace_id(), context.span().span_context().span_id(), - context - .span() - .span_context() - .trace_flags() - .with_sampled(true), + context.span().span_context().trace_flags(), true, state, )); @@ -316,7 +317,9 @@ impl IntegrationTest { pub async fn new( config: String, telemetry: Option, + extra_propagator: Option, responder: Option, + subgraph_context: Option>>>, collect_stdio: Option>, supergraph: Option, mut subgraph_overrides: HashMap, @@ -325,6 +328,7 @@ impl IntegrationTest { ) -> Self { let redis_namespace = Uuid::new_v4().to_string(); let telemetry = 
telemetry.unwrap_or_default(); + let extra_propagator = extra_propagator.unwrap_or_default(); let tracer_provider_client = telemetry.tracer_provider("client"); let subscriber_client = Self::dispatch(&tracer_provider_client); let tracer_provider_subgraph = telemetry.tracer_provider("subgraph"); @@ -355,7 +359,8 @@ impl IntegrationTest { ResponseTemplate::new(200).set_body_json(json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}))), telemetry: telemetry.clone(), subscriber_subgraph: Self::dispatch(&tracer_provider_subgraph), - subgraph_callback + subgraph_callback, + subgraph_context: subgraph_context.unwrap_or_default() }) .mount(&subgraphs) .await; @@ -390,6 +395,7 @@ impl IntegrationTest { subscriber_client, _tracer_provider_subgraph: tracer_provider_subgraph, telemetry, + extra_propagator, redis_namespace, log: log.unwrap_or_else(|| "error,apollo_router=info".to_owned()), } @@ -595,6 +601,7 @@ impl IntegrationTest { "router was not started, call `router.start().await; router.assert_started().await`" ); let telemetry = self.telemetry.clone(); + let extra_propagator = self.extra_propagator.clone(); let query = query.clone(); let url = format!("http://{}", self.bind_address()); @@ -624,6 +631,7 @@ impl IntegrationTest { let mut request = builder.json(&query).build().unwrap(); telemetry.inject_context(&mut request); + extra_propagator.inject_context(&mut request); match client.execute(request).await { Ok(response) => (span_id, response), Err(err) => { diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index 39757ee389..948ff9d165 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -8,9 +8,7 @@ use std::time::Duration; use anyhow::anyhow; use opentelemetry_api::trace::SpanContext; -use opentelemetry_api::trace::TraceContextExt; use opentelemetry_api::trace::TraceId; -use 
opentelemetry_api::Context; use serde_json::json; use serde_json::Value; use tower::BoxError; @@ -30,6 +28,8 @@ struct TraceSpec { measured_spans: HashSet<&'static str>, unmeasured_spans: HashSet<&'static str>, priority_sampled: Option<&'static str>, + subgraph_sampled: Option, + subgraph_context: Option>>>, // Not the metrics but the otel attribute no_priority_sampled_attribute: Option, } @@ -40,19 +40,13 @@ async fn test_no_sample() -> Result<(), BoxError> { return Ok(()); } let context = std::sync::Arc::new(std::sync::Mutex::new(None)); - let context_clone = context.clone(); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!("fixtures/datadog_no_sample.router.yaml")) .responder(ResponseTemplate::new(200).set_body_json( json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), )) - .subgraph_callback(Box::new(move || { - let context = Context::current(); - let span = context.span(); - let span_context = span.span_context(); - *context_clone.lock().expect("poisoned") = Some(span_context.clone()); - })) + .subgraph_context(context.clone()) .build() .await; @@ -63,14 +57,12 @@ async fn test_no_sample() -> Result<(), BoxError> { let (_id, result) = router.execute_untraced_query(&query, None).await; router.graceful_shutdown().await; assert!(result.status().is_success()); - let context = context - .lock() - .expect("poisoned") - .as_ref() - .expect("state") - .clone(); - assert!(context.is_sampled()); - assert_eq!(context.trace_state().get("psr"), Some("0")); + let context = context.lock().expect("poisoned"); + assert!(!context.as_ref().unwrap().is_sampled()); + assert_eq!( + context.as_ref().unwrap().trace_state().get("psr"), + Some("0") + ); Ok(()) } @@ -82,7 +74,6 @@ async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> { return Ok(()); } let context = std::sync::Arc::new(std::sync::Mutex::new(None)); - let context_clone = context.clone(); let mut router = 
IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!( @@ -91,12 +82,7 @@ async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> { .responder(ResponseTemplate::new(200).set_body_json( json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), )) - .subgraph_callback(Box::new(move || { - let context = Context::current(); - let span = context.span(); - let span_context = span.span_context(); - *context_clone.lock().expect("poisoned") = Some(span_context.clone()); - })) + .subgraph_context(context.clone()) .build() .await; @@ -107,20 +93,14 @@ async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> { let (id, result) = router.execute_untraced_query(&query, None).await; router.graceful_shutdown().await; assert!(result.status().is_success()); - let _context = context - .lock() - .expect("poisoned") - .as_ref() - .expect("state") - .clone(); - tokio::time::sleep(Duration::from_secs(2)).await; + TraceSpec::builder() .services([].into()) + .subgraph_sampled(false) .build() .validate_trace(id) .await?; - Ok(()) } @@ -130,19 +110,13 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { return Ok(()); } let context = std::sync::Arc::new(std::sync::Mutex::new(None)); - let context_clone = context.clone(); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!("fixtures/datadog.router.yaml")) .responder(ResponseTemplate::new(200).set_body_json( json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), )) - .subgraph_callback(Box::new(move || { - let context = Context::current(); - let span = context.span(); - let span_context = span.span_context(); - *context_clone.lock().expect("poisoned") = Some(span_context.clone()); - })) + .subgraph_context(context.clone()) .build() .await; @@ -151,41 +125,41 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { // Parent based sampling. 
psr MUST be populated with the value that we pass in. test_psr( - &context, &mut router, Some("-1"), TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) + .services(["client", "router"].into()) + .subgraph_sampled(false) .priority_sampled("-1") .build(), ) .await?; test_psr( - &context, &mut router, Some("0"), TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) + .services(["client", "router"].into()) + .subgraph_sampled(false) .priority_sampled("0") .build(), ) .await?; test_psr( - &context, &mut router, Some("1"), TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) .priority_sampled("1") .build(), ) .await?; test_psr( - &context, &mut router, Some("2"), TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) .priority_sampled("2") .build(), ) @@ -193,11 +167,11 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. 
test_psr( - &context, &mut router, None, TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) .priority_sampled("1") .build(), ) @@ -214,19 +188,13 @@ async fn test_priority_sampling_propagated_otel_request() -> Result<(), BoxError return Ok(()); } let context = std::sync::Arc::new(std::sync::Mutex::new(None)); - let context_clone = context.clone(); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { endpoint: None }) .config(include_str!("fixtures/datadog.router.yaml")) .responder(ResponseTemplate::new(200).set_body_json( json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), )) - .subgraph_callback(Box::new(move || { - let context = Context::current(); - let span = context.span(); - let span_context = span.span_context(); - *context_clone.lock().expect("poisoned") = Some(span_context.clone()); - })) + .subgraph_context(context.clone()) .build() .await; @@ -261,7 +229,6 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { return Ok(()); } let context = std::sync::Arc::new(std::sync::Mutex::new(None)); - let context_clone = context.clone(); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!( @@ -270,12 +237,7 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { .responder(ResponseTemplate::new(200).set_body_json( json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), )) - .subgraph_callback(Box::new(move || { - let context = Context::current(); - let span = context.span(); - let span_context = span.span_context(); - *context_clone.lock().expect("poisoned") = Some(span_context.clone()); - })) + .subgraph_context(context.clone()) .build() .await; @@ -284,7 +246,6 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { // The router will ignore the upstream PSR as parent based sampling is disabled. 
test_psr( - &context, &mut router, Some("-1"), TraceSpec::builder() @@ -294,7 +255,6 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("0"), TraceSpec::builder() @@ -304,7 +264,6 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("1"), TraceSpec::builder() @@ -314,7 +273,6 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("2"), TraceSpec::builder() @@ -325,7 +283,6 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { .await?; test_psr( - &context, &mut router, None, TraceSpec::builder() @@ -341,7 +298,6 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { } async fn test_psr( - context: &Arc>>, router: &mut IntegrationTest, psr: Option<&str>, trace_spec: TraceSpec, @@ -355,19 +311,7 @@ async fn test_psr( let (id, result) = router .execute_query_with_headers(&query, headers.into_iter().collect()) .await; - assert!(result.status().is_success()); - let context = context - .lock() - .expect("poisoned") - .as_ref() - .expect("state") - .clone(); - - assert_eq!( - context.trace_state().get("psr"), - trace_spec.priority_sampled - ); trace_spec.validate_trace(id).await?; Ok(()) } @@ -823,6 +767,19 @@ impl TraceSpec { tokio::time::sleep(Duration::from_millis(100)).await; } self.find_valid_trace(&url).await?; + + if let Some(subgraph_context) = &self.subgraph_context { + let subgraph_context = subgraph_context.lock().expect("poisoned"); + let subgraph_span_context = subgraph_context.as_ref().expect("state").clone(); + + assert_eq!( + subgraph_span_context.trace_state().get("psr"), + self.priority_sampled + ); + if let Some(sampled) = self.subgraph_sampled { + assert_eq!(subgraph_span_context.is_sampled(), sampled); + } + } Ok(()) } diff --git 
a/apollo-router/tests/integration/telemetry/otlp.rs b/apollo-router/tests/integration/telemetry/otlp.rs index 0ba9178cec..928eae1a4f 100644 --- a/apollo-router/tests/integration/telemetry/otlp.rs +++ b/apollo-router/tests/integration/telemetry/otlp.rs @@ -2,9 +2,12 @@ extern crate core; use std::collections::HashMap; use std::collections::HashSet; +use std::sync::Arc; +use std::sync::Mutex; use std::time::Duration; use anyhow::anyhow; +use opentelemetry_api::trace::SpanContext; use opentelemetry_api::trace::TraceId; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceResponse; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceResponse; @@ -67,6 +70,7 @@ async fn test_basic() -> Result<(), BoxError> { ] .into(), ) + .subgraph_sampled(true) .build() .validate_trace(id, &mock_server) .await?; @@ -106,6 +110,7 @@ async fn test_otlp_request_with_datadog_propagator() -> Result<(), BoxError> { Spec::builder() .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build() .validate_trace(id, &mock_server) .await?; @@ -136,6 +141,7 @@ async fn test_otlp_request_with_datadog_propagator_no_agent() -> Result<(), BoxE let (id, _) = router.execute_query(&query).await; Spec::builder() .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) .build() .validate_trace(id, &mock_server) .await?; @@ -169,6 +175,7 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( Spec::builder() .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build() .validate_trace(id, &mock_server) .await?; @@ -194,6 +201,7 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( Spec::builder() .services(["router"].into()) .priority_sampled("0") + .subgraph_sampled(true) .build() .validate_trace(id, &mock_server) .await?; @@ -214,6 +222,7 @@ async fn 
test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( Spec::builder() .services(["router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build() .validate_trace(id, &mock_server) .await?; @@ -254,6 +263,7 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( Spec::builder() .services(["router"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build() .validate_trace(id, &mock_server) .await?; @@ -281,6 +291,7 @@ async fn test_untraced_request_no_sample_datadog_agent() -> Result<(), BoxError> Spec::builder() .services(["router"].into()) .priority_sampled("0") + .subgraph_sampled(false) .build() .validate_trace(id, &mock_server) .await?; @@ -306,6 +317,7 @@ async fn test_untraced_request_sample_datadog_agent() -> Result<(), BoxError> { Spec::builder() .services(["router"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build() .validate_trace(id, &mock_server) .await?; @@ -352,13 +364,18 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_propagation.router.yaml") .replace("", &mock_server.uri()); + let context = Arc::new(Mutex::new(None)); let mut router = IntegrationTest::builder() // We're using datadog propagation as this is what we are trying to test. - .telemetry(Telemetry::Datadog) + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) .config(config) .responder(ResponseTemplate::new(200).set_body_json( json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), )) + .subgraph_context(context.clone()) .build() .await; @@ -367,41 +384,49 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { // Parent based sampling. psr MUST be populated with the value that we pass in. 
test_psr( + &context, &mut router, Some("-1"), Spec::builder() - .services(["router"].into()) + .services(["client", "router"].into()) .priority_sampled("-1") + .subgraph_sampled(false) .build(), &mock_server, ) .await?; test_psr( + &context, &mut router, Some("0"), Spec::builder() - .services(["router"].into()) + .services(["client", "router"].into()) .priority_sampled("0") + .subgraph_sampled(false) .build(), &mock_server, ) .await?; test_psr( + &context, &mut router, Some("1"), Spec::builder() - .services(["router"].into()) + .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build(), &mock_server, ) .await?; test_psr( + &context, &mut router, Some("2"), Spec::builder() - .services(["router"].into()) + .services(["client", "router", "subgraph"].into()) .priority_sampled("2") + .subgraph_sampled(true) .build(), &mock_server, ) @@ -409,11 +434,13 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. 
test_psr( + &context, &mut router, None, Spec::builder() - .services(["router"].into()) + .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build(), &mock_server, ) @@ -432,12 +459,17 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml") .replace("", &mock_server.uri()); + let context = Arc::new(Mutex::new(None)); let mut router = IntegrationTest::builder() - .telemetry(Telemetry::Datadog) + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) .config(config) .responder(ResponseTemplate::new(200).set_body_json( json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), )) + .subgraph_context(context.clone()) .build() .await; @@ -446,52 +478,62 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { // The router will ignore the upstream PSR as parent based sampling is disabled. 
test_psr( + &context, &mut router, Some("-1"), Spec::builder() - .services(["router"].into()) + .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build(), &mock_server, ) .await?; test_psr( + &context, &mut router, Some("0"), Spec::builder() - .services(["router"].into()) + .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build(), &mock_server, ) .await?; test_psr( + &context, &mut router, Some("1"), Spec::builder() - .services(["router"].into()) + .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build(), &mock_server, ) .await?; test_psr( + &context, &mut router, Some("2"), Spec::builder() - .services(["router"].into()) + .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build(), &mock_server, ) .await?; test_psr( + &context, &mut router, None, Spec::builder() - .services(["router"].into()) + .services(["client", "router", "subgraph"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build(), &mock_server, ) @@ -503,6 +545,7 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { } async fn test_psr( + context: &Arc>>, router: &mut IntegrationTest, psr: Option<&str>, trace_spec: Spec, @@ -519,12 +562,14 @@ async fn test_psr( .await; assert!(result.status().is_success()); + trace_spec.validate_trace(id, mock_server).await?; Ok(()) } #[derive(buildstructor::Builder)] struct Spec { + subgraph_context: Option>>>, operation_name: Option, version: Option, services: HashSet<&'static str>, @@ -532,6 +577,7 @@ struct Spec { measured_spans: HashSet<&'static str>, unmeasured_spans: HashSet<&'static str>, priority_sampled: Option<&'static str>, + subgraph_sampled: Option, } impl Spec { @@ -544,6 +590,20 @@ impl Spec { tokio::time::sleep(Duration::from_millis(100)).await; } self.find_valid_trace(id, mock_server).await?; + + if let 
Some(subgraph_context) = &self.subgraph_context { + let subgraph_context = subgraph_context.lock().expect("poisoned"); + let subgraph_span_context = subgraph_context.as_ref().expect("state").clone(); + + assert_eq!( + subgraph_span_context.trace_state().get("psr"), + self.priority_sampled + ); + if let Some(sampled) = self.subgraph_sampled { + assert_eq!(subgraph_span_context.is_sampled(), sampled); + } + } + Ok(()) } From 4490216c0b8b6b1aab9e1ded06c807e822984edb Mon Sep 17 00:00:00 2001 From: bryn Date: Tue, 10 Dec 2024 15:30:19 +0000 Subject: [PATCH 14/26] Lint and test fixes --- .../tests/integration/telemetry/otlp.rs | 47 ++++++++++++++----- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/apollo-router/tests/integration/telemetry/otlp.rs b/apollo-router/tests/integration/telemetry/otlp.rs index 928eae1a4f..59dc15a518 100644 --- a/apollo-router/tests/integration/telemetry/otlp.rs +++ b/apollo-router/tests/integration/telemetry/otlp.rs @@ -158,10 +158,12 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml") .replace("", &mock_server.uri()); + let context = Arc::new(Mutex::new(None)); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) + .subgraph_context(context.clone()) .config(&config) .build() .await; @@ -173,6 +175,7 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( let (id, _) = router.execute_query(&query).await; Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -199,6 +202,7 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; Spec::builder() + 
.subgraph_context(context.clone()) .services(["router"].into()) .priority_sampled("0") .subgraph_sampled(true) @@ -220,6 +224,7 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; Spec::builder() + .subgraph_context(context.clone()) .services(["router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -241,6 +246,7 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; Spec::builder() + .subgraph_context(context.clone()) .services(["router"].into()) .priority_sampled("0") .build() @@ -261,6 +267,7 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; Spec::builder() + .subgraph_context(context.clone()) .services(["router"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -281,7 +288,12 @@ async fn test_untraced_request_no_sample_datadog_agent() -> Result<(), BoxError> let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_agent_no_sample.router.yaml") .replace("", &mock_server.uri()); - let mut router = IntegrationTest::builder().config(&config).build().await; + let context = Arc::new(Mutex::new(None)); + let mut router = IntegrationTest::builder() + .subgraph_context(context.clone()) + .config(&config) + .build() + .await; router.start().await; router.assert_started().await; @@ -289,6 +301,7 @@ async fn test_untraced_request_no_sample_datadog_agent() -> Result<(), BoxError> let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); let (id, _) = router.execute_untraced_query(&query, None).await; Spec::builder() + .subgraph_context(context) .services(["router"].into()) .priority_sampled("0") .subgraph_sampled(false) @@ -307,7 +320,12 @@ async fn 
test_untraced_request_sample_datadog_agent() -> Result<(), BoxError> { let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_agent_sample.router.yaml") .replace("", &mock_server.uri()); - let mut router = IntegrationTest::builder().config(&config).build().await; + let context = Arc::new(Mutex::new(None)); + let mut router = IntegrationTest::builder() + .subgraph_context(context.clone()) + .config(&config) + .build() + .await; router.start().await; router.assert_started().await; @@ -315,6 +333,7 @@ async fn test_untraced_request_sample_datadog_agent() -> Result<(), BoxError> { let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); let (id, _) = router.execute_untraced_query(&query, None).await; Spec::builder() + .subgraph_context(context.clone()) .services(["router"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -331,12 +350,14 @@ async fn test_untraced_request_sample_datadog_agent_unsampled() -> Result<(), Bo panic!("Error: test skipped because GraphOS is not enabled"); } let mock_server = mock_otlp_server().await; + let context = Arc::new(Mutex::new(None)); let config = include_str!("fixtures/otlp_datadog_agent_sample_no_sample.router.yaml") .replace("", &mock_server.uri()); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) + .subgraph_context(context.clone()) .config(&config) .build() .await; @@ -349,6 +370,8 @@ async fn test_untraced_request_sample_datadog_agent_unsampled() -> Result<(), Bo Spec::builder() .services(["router"].into()) .priority_sampled("0") + .subgraph_sampled(false) + .subgraph_context(context.clone()) .build() .validate_trace(id, &mock_server) .await?; @@ -384,10 +407,10 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { // Parent based sampling. psr MUST be populated with the value that we pass in. 
test_psr( - &context, &mut router, Some("-1"), Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router"].into()) .priority_sampled("-1") .subgraph_sampled(false) @@ -396,10 +419,10 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("0"), Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router"].into()) .priority_sampled("0") .subgraph_sampled(false) @@ -408,7 +431,6 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("1"), Spec::builder() @@ -420,10 +442,10 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("2"), Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router", "subgraph"].into()) .priority_sampled("2") .subgraph_sampled(true) @@ -434,10 +456,10 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. test_psr( - &context, &mut router, None, Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -478,10 +500,10 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { // The router will ignore the upstream PSR as parent based sampling is disabled. 
test_psr( - &context, &mut router, Some("-1"), Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -490,10 +512,10 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("0"), Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -502,10 +524,10 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("1"), Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -514,10 +536,10 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { ) .await?; test_psr( - &context, &mut router, Some("2"), Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -527,10 +549,10 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { .await?; test_psr( - &context, &mut router, None, Spec::builder() + .subgraph_context(context.clone()) .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) @@ -545,7 +567,6 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { } async fn test_psr( - context: &Arc>>, router: &mut IntegrationTest, psr: Option<&str>, trace_spec: Spec, From bfeca6e231c8f987a6204da59fb0e89e31842284 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 10:44:41 +0000 Subject: [PATCH 15/26] Improve integration tester --- apollo-router/tests/common.rs | 257 ++++--- apollo-router/tests/integration/batching.rs | 4 +- .../tests/integration/coprocessor.rs | 4 +- .../tests/integration/introspection.rs | 8 
+- .../tests/integration/operation_limits.rs | 6 +- .../query_planner/max_evaluated_plans.rs | 14 +- .../tests/integration/subgraph_response.rs | 22 +- .../tests/integration/subscription.rs | 5 +- apollo-router/tests/integration/supergraph.rs | 13 +- .../tests/integration/telemetry/datadog.rs | 616 ++++++--------- ...adog_parent_sampler_very_small.router.yaml | 29 + ...request_with_zipkin_propagator.router.yaml | 1 + .../tests/integration/telemetry/jaeger.rs | 639 ++++++---------- .../tests/integration/telemetry/logging.rs | 79 +- .../tests/integration/telemetry/metrics.rs | 26 +- .../tests/integration/telemetry/mod.rs | 18 + .../tests/integration/telemetry/otlp.rs | 706 ++++++++---------- .../integration/telemetry/propagation.rs | 5 +- .../tests/integration/telemetry/verifier.rs | 160 ++++ .../tests/integration/telemetry/zipkin.rs | 211 +++--- .../tests/integration/traffic_shaping.rs | 6 +- apollo-router/tests/samples_tests.rs | 3 +- 22 files changed, 1309 insertions(+), 1523 deletions(-) create mode 100644 apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml create mode 100644 apollo-router/tests/integration/telemetry/verifier.rs diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index 2fa1d8a024..5374059ccf 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -8,7 +8,7 @@ use std::sync::Arc; use std::sync::Mutex; use std::time::Duration; -use buildstructor::buildstructor; +use buildstructor::{buildstructor}; use fred::clients::RedisClient; use fred::interfaces::ClientLike; use fred::interfaces::KeysInterface; @@ -18,7 +18,6 @@ use fred::types::Scanner; use futures::StreamExt; use http::header::ACCEPT; use http::header::CONTENT_TYPE; -use http::HeaderValue; use mediatype::names::BOUNDARY; use mediatype::names::FORM_DATA; use mediatype::names::MULTIPART; @@ -70,6 +69,65 @@ use wiremock::Mock; use wiremock::Respond; use wiremock::ResponseTemplate; +pub struct Query { + 
traced: bool, + psr: Option<&'static str>, + headers: HashMap, + content_type: String, + body: Value, +} + +impl Default for Query { + fn default() -> Self { + Query::builder().build() + } +} + +#[buildstructor::buildstructor] +impl Query { + #[builder] + pub fn new( + traced: Option, + psr: Option<&'static str>, + body: Option, + content_type: Option, + headers: HashMap, + ) -> Self { + Self { + traced: traced.unwrap_or(true), + psr, + body: body.unwrap_or( + json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}), + ), + content_type: content_type + .unwrap_or_else(|| APPLICATION_JSON.essence_str().to_string()), + headers, + } + } +} +impl Query { + pub fn with_bad_content_type(mut self) -> Self { + self.content_type = "garbage".to_string(); + self + } + + pub fn with_bad_query(mut self) -> Self { + self.body = json!({"garbage":{}}); + self + } + + pub fn with_huge_query(mut self) -> Self { + self.body = json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name}}","variables":{}}); + self + } + + pub fn introspection() -> Query { + Query::builder() + .body(json!({"query":"{__schema {types {name}}}","variables":{}})) + .build() + } +} + pub struct IntegrationTest { router: Option, test_config_location: PathBuf, @@ -90,6 +148,7 @@ pub struct IntegrationTest { bind_address: Arc>>, redis_namespace: String, log: String, + subgraph_context: Arc>>, } impl IntegrationTest { @@ -104,6 +163,7 @@ impl IntegrationTest { struct TracedResponder { response_template: ResponseTemplate, telemetry: Telemetry, + extra_propagator: Telemetry, subscriber_subgraph: Dispatch, subgraph_callback: Option>, subgraph_context: Arc>>, @@ -111,10 +171,11 @@ struct TracedResponder { impl Respond for TracedResponder { fn respond(&self, request: &wiremock::Request) -> ResponseTemplate { - let context = 
self.telemetry.extract_context(request); + let context = self.telemetry.extract_context(request, &Context::new()); + let context = self.extra_propagator.extract_context(request, &context); + *self.subgraph_context.lock().expect("lock poisoned") = Some(context.span().span_context().clone()); - tracing_core::dispatcher::with_default(&self.subscriber_subgraph, || { let _context_guard = context.attach(); let span = info_span!("subgraph server"); @@ -264,7 +325,7 @@ impl Telemetry { } } - pub(crate) fn extract_context(&self, request: &wiremock::Request) -> Context { + pub(crate) fn extract_context(&self, request: &wiremock::Request, context: &Context) -> Context { let headers: HashMap = request .headers .iter() @@ -274,11 +335,13 @@ impl Telemetry { match self { Telemetry::Jaeger => { let propagator = opentelemetry_jaeger::Propagator::new(); - propagator.extract(&headers) + propagator.extract_with_context(context, &headers) } Telemetry::Datadog => { + let span_ref = context.span(); + let original_span_context = span_ref.span_context(); let propagator = opentelemetry_datadog::DatadogPropagator::new(); - let mut context = propagator.extract(&headers); + let mut context = propagator.extract_with_context(context, &headers); // We're going to override the sampled so that we can test sampling priority if let Some(psr) = headers.get("x-datadog-sampling-priority") { let state = context @@ -287,8 +350,14 @@ impl Telemetry { .trace_state() .insert("psr", psr.to_string()) .expect("psr"); + let new_trace_id = if original_span_context.is_valid() { + original_span_context.trace_id() + } + else { + context.span().span_context().trace_id() + }; context = context.with_remote_span_context(SpanContext::new( - context.span().span_context().trace_id(), + new_trace_id, context.span().span_context().span_id(), context.span().span_context().trace_flags(), true, @@ -300,13 +369,13 @@ impl Telemetry { } Telemetry::Otlp { .. 
} => { let propagator = opentelemetry::sdk::propagation::TraceContextPropagator::default(); - propagator.extract(&headers) + propagator.extract_with_context(context, &headers) } Telemetry::Zipkin => { let propagator = opentelemetry_zipkin::Propagator::new(); - propagator.extract(&headers) + propagator.extract_with_context(context, &headers) } - _ => Context::current(), + _ => context.clone(), } } } @@ -319,7 +388,6 @@ impl IntegrationTest { telemetry: Option, extra_propagator: Option, responder: Option, - subgraph_context: Option>>>, collect_stdio: Option>, supergraph: Option, mut subgraph_overrides: HashMap, @@ -354,13 +422,15 @@ impl IntegrationTest { .start() .await; + let subgraph_context = Arc::new(Mutex::new(None)); Mock::given(method("POST")) .respond_with(TracedResponder{response_template:responder.unwrap_or_else(|| ResponseTemplate::new(200).set_body_json(json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}))), telemetry: telemetry.clone(), + extra_propagator: extra_propagator.clone(), subscriber_subgraph: Self::dispatch(&tracer_provider_subgraph), subgraph_callback, - subgraph_context: subgraph_context.unwrap_or_default() + subgraph_context: subgraph_context.clone() }) .mount(&subgraphs) .await; @@ -398,6 +468,7 @@ impl IntegrationTest { extra_propagator, redis_namespace, log: log.unwrap_or_else(|| "error,apollo_router=info".to_owned()), + subgraph_context, } } @@ -415,6 +486,15 @@ impl IntegrationTest { Dispatch::new(subscriber) } + pub fn subgraph_context(&self) -> SpanContext { + self.subgraph_context + .lock() + .expect("lock poisoned") + .as_ref() + .unwrap() + .clone() + } + pub fn router_location() -> PathBuf { PathBuf::from(env!("CARGO_BIN_EXE_router")) } @@ -541,60 +621,15 @@ impl IntegrationTest { fs::copy(supergraph_path, &self.test_schema_location).expect("could not write schema"); } - #[allow(dead_code)] pub fn execute_default_query( &self, ) -> impl std::future::Future { - self.execute_query_internal( - 
&json!({"query":"query {topProducts{name}}","variables":{}}), - None, - None, - ) + self.execute_query(Query::builder().build()) } - #[allow(dead_code)] pub fn execute_query( &self, - query: &Value, - ) -> impl std::future::Future { - self.execute_query_internal(query, None, None) - } - - #[allow(dead_code)] - pub fn execute_bad_query( - &self, - ) -> impl std::future::Future { - self.execute_query_internal(&json!({"garbage":{}}), None, None) - } - - #[allow(dead_code)] - pub fn execute_huge_query( - &self, - ) -> impl std::future::Future { - self.execute_query_internal(&json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name}}","variables":{}}), None, None) - } - - #[allow(dead_code)] - pub fn execute_bad_content_type( - &self, - ) -> impl std::future::Future { - self.execute_query_internal(&json!({"garbage":{}}), Some("garbage"), None) - } - - #[allow(dead_code)] - pub fn execute_query_with_headers( - &self, - query: &Value, - headers: HashMap, - ) -> impl std::future::Future { - self.execute_query_internal(query, None, Some(headers)) - } - - fn execute_query_internal( - &self, - query: &Value, - content_type: Option<&'static str>, - headers: Option>, + query: Query, ) -> impl std::future::Future { assert!( self.router.is_some(), @@ -603,37 +638,46 @@ impl IntegrationTest { let telemetry = self.telemetry.clone(); let extra_propagator = self.extra_propagator.clone(); - let query = query.clone(); let url = format!("http://{}", self.bind_address()); - + let subgraph_context = self.subgraph_context.clone(); async move { let span = info_span!("client_request"); - let span_id = span.context().span().span_context().trace_id(); + let trace_id = span.context().span().span_context().trace_id(); async move { let client = reqwest::Client::new(); - let mut builder = client - .post(url) - .header( - CONTENT_TYPE, - 
content_type.unwrap_or(APPLICATION_JSON.essence_str()), - ) - .header("apollographql-client-name", "custom_name") - .header("apollographql-client-version", "1.0") - .header("x-my-header", "test") - .header("head", "test"); + let mut builder = client.post(url).header(CONTENT_TYPE, query.content_type); - if let Some(headers) = headers { - for (name, value) in headers { - builder = builder.header(name, value); - } + for (name, value) in query.headers { + builder = builder.header(name, value); + } + + if let Some(psr) = query.psr { + builder = builder.header("x-datadog-sampling-priority", psr); + } + + let mut request = builder.json(&query.body).build().unwrap(); + if query.traced { + telemetry.inject_context(&mut request); + extra_propagator.inject_context(&mut request); } - let mut request = builder.json(&query).build().unwrap(); - telemetry.inject_context(&mut request); - extra_propagator.inject_context(&mut request); match client.execute(request).await { - Ok(response) => (span_id, response), + Ok(response) => { + if query.traced { + (trace_id, response) + } else { + ( + subgraph_context + .lock() + .expect("poisoned") + .as_ref() + .expect("subgraph context") + .trace_id(), + response, + ) + } + } Err(err) => { panic!("unable to send successful request to router, {err}") } @@ -645,57 +689,6 @@ impl IntegrationTest { .with_subscriber(self.subscriber_client.clone()) } - #[allow(dead_code)] - pub fn execute_untraced_query( - &self, - query: &Value, - headers: Option>, - ) -> impl std::future::Future { - assert!( - self.router.is_some(), - "router was not started, call `router.start().await; router.assert_started().await`" - ); - let query = query.clone(); - let url = format!("http://{}", self.bind_address()); - - async move { - let client = reqwest::Client::new(); - - let mut builder = client - .post(url) - .header(CONTENT_TYPE, APPLICATION_JSON.essence_str()) - .header("apollographql-client-name", "custom_name") - .header("apollographql-client-version", "1.0") - 
.json(&query); - - if let Some(headers) = headers { - for (name, value) in headers { - builder = builder.header(name, value); - } - } - - match client.execute(builder.build().unwrap()).await { - Ok(response) => ( - TraceId::from_hex( - response - .headers() - .get("apollo-custom-trace-id") - .cloned() - .unwrap_or(HeaderValue::from_static("no-trace-id")) - .to_str() - .unwrap_or_default(), - ) - .unwrap_or(TraceId::INVALID), - response, - ), - Err(err) => { - panic!("unable to send successful request to router, {err}") - } - } - } - .with_subscriber(self.subscriber_client.clone()) - } - /// Make a raw multipart request to the router. #[allow(dead_code)] pub fn execute_multipart_request( diff --git a/apollo-router/tests/integration/batching.rs b/apollo-router/tests/integration/batching.rs index 15dfd38de2..f9e7ba6ab8 100644 --- a/apollo-router/tests/integration/batching.rs +++ b/apollo-router/tests/integration/batching.rs @@ -856,7 +856,7 @@ mod helper { use wiremock::ResponseTemplate; use super::test_is_enabled; - use crate::integration::common::IntegrationTest; + use crate::integration::common::{IntegrationTest, Query}; /// Helper type for specifying a valid handler pub type Handler = fn(&wiremock::Request) -> ResponseTemplate; @@ -916,7 +916,7 @@ mod helper { // Execute the request let request = serde_json::to_value(requests)?; - let (_span, response) = router.execute_query(&request).await; + let (_span, response) = router.execute_query(Query::builder().body(request).build()).await; serde_json::from_slice::>(&response.bytes().await?).map_err(BoxError::from) } diff --git a/apollo-router/tests/integration/coprocessor.rs b/apollo-router/tests/integration/coprocessor.rs index d9ce741892..21c5f0db2b 100644 --- a/apollo-router/tests/integration/coprocessor.rs +++ b/apollo-router/tests/integration/coprocessor.rs @@ -7,7 +7,7 @@ use wiremock::matchers::path; use wiremock::Mock; use wiremock::ResponseTemplate; -use crate::integration::common::graph_os_enabled; +use 
crate::integration::common::{graph_os_enabled, Query}; use crate::integration::IntegrationTest; #[tokio::test(flavor = "multi_thread")] @@ -75,7 +75,7 @@ async fn test_coprocessor_limit_payload() -> Result<(), BoxError> { assert_eq!(response.status(), 200); // This query is huge and will be rejected because it is too large before hitting the coprocessor - let (_trace_id, response) = router.execute_huge_query().await; + let (_trace_id, response) = router.execute_query(Query::default().with_huge_query()).await; assert_eq!(response.status(), 413); assert_yaml_snapshot!(response.text().await?); diff --git a/apollo-router/tests/integration/introspection.rs b/apollo-router/tests/integration/introspection.rs index 95c8ad9c8c..56b2a496cd 100644 --- a/apollo-router/tests/integration/introspection.rs +++ b/apollo-router/tests/integration/introspection.rs @@ -1,10 +1,10 @@ +use crate::integration::common::Query; +use crate::integration::IntegrationTest; use apollo_router::plugin::test::MockSubgraph; use apollo_router::services::supergraph::Request; use serde_json::json; use tower::ServiceExt; -use crate::integration::IntegrationTest; - #[tokio::test] async fn simple() { let request = Request::fake_builder() @@ -226,7 +226,9 @@ async fn integration() { let query = json!({ "query": include_str!("../fixtures/introspect_full_schema.graphql"), }); - let (_trace_id, response) = router.execute_query(&query).await; + let (_trace_id, response) = router + .execute_query(Query::builder().body(query).build()) + .await; insta::assert_json_snapshot!(response.json::().await.unwrap()); router.graceful_shutdown().await; } diff --git a/apollo-router/tests/integration/operation_limits.rs b/apollo-router/tests/integration/operation_limits.rs index 79ad7d9f89..b0c5b25802 100644 --- a/apollo-router/tests/integration/operation_limits.rs +++ b/apollo-router/tests/integration/operation_limits.rs @@ -9,7 +9,7 @@ use apollo_router::TestHarness; use serde_json::json; use tower::BoxError; use 
tower::ServiceExt; - +use crate::integration::common::Query; use crate::integration::IntegrationTest; #[tokio::test(flavor = "multi_thread")] @@ -310,7 +310,7 @@ async fn test_request_bytes_limit_with_coprocessor() -> Result<(), BoxError> { .await; router.start().await; router.assert_started().await; - let (_, resp) = router.execute_huge_query().await; + let (_, resp) = router.execute_query(Query::default().with_huge_query()).await; assert_eq!(resp.status(), 413); router.graceful_shutdown().await; Ok(()) @@ -324,7 +324,7 @@ async fn test_request_bytes_limit() -> Result<(), BoxError> { .await; router.start().await; router.assert_started().await; - let (_, resp) = router.execute_huge_query().await; + let (_, resp) = router.execute_query(Query::default().with_huge_query()).await; assert_eq!(resp.status(), 413); router.graceful_shutdown().await; Ok(()) diff --git a/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs b/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs index d6474aa30b..dc7800feea 100644 --- a/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs +++ b/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs @@ -1,5 +1,5 @@ use serde_json::json; - +use crate::integration::common::Query; use crate::integration::IntegrationTest; fn assert_evaluated_plans(prom: &str, expected: u64) { @@ -31,10 +31,10 @@ async fn reports_evaluated_plans() { router.start().await; router.assert_started().await; router - .execute_query(&json!({ + .execute_query(Query::builder().body(json!({ "query": r#"{ t { v1 v2 v3 v4 } }"#, "variables": {}, - })) + })).build()) .await; let metrics = router @@ -71,10 +71,10 @@ async fn does_not_exceed_max_evaluated_plans_legacy() { router.start().await; router.assert_started().await; router - .execute_query(&json!({ + .execute_query(Query::builder().body(json!({ "query": r#"{ t { v1 v2 v3 v4 } }"#, "variables": {}, - })) + })).build()) .await; let metrics = router @@ -111,10 
+111,10 @@ async fn does_not_exceed_max_evaluated_plans() { router.start().await; router.assert_started().await; router - .execute_query(&json!({ + .execute_query(Query::builder().body(json!({ "query": r#"{ t { v1 v2 v3 v4 } }"#, "variables": {}, - })) + })).build()) .await; let metrics = router diff --git a/apollo-router/tests/integration/subgraph_response.rs b/apollo-router/tests/integration/subgraph_response.rs index e37a0da067..802bfc2de4 100644 --- a/apollo-router/tests/integration/subgraph_response.rs +++ b/apollo-router/tests/integration/subgraph_response.rs @@ -1,7 +1,7 @@ use serde_json::json; use tower::BoxError; use wiremock::ResponseTemplate; - +use crate::integration::common::Query; use crate::integration::IntegrationTest; const CONFIG: &str = r#" @@ -21,7 +21,7 @@ async fn test_subgraph_returning_data_null() -> Result<(), BoxError> { router.assert_started().await; let query = "{ __typename topProducts { name } }"; - let (_trace_id, response) = router.execute_query(&json!({ "query": query })).await; + let (_trace_id, response) = router.execute_query(Query::builder().body(json!({ "query": query })).build()).await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -64,7 +64,7 @@ async fn test_subgraph_returning_different_typename_on_query_root() -> Result<() inside_fragment: __typename } "#; - let (_trace_id, response) = router.execute_query(&json!({ "query": query })).await; + let (_trace_id, response) = router.execute_query(Query::builder().body(json!({ "query": query })).build()).await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -99,7 +99,7 @@ async fn test_valid_extensions_service_for_subgraph_error() -> Result<(), BoxErr router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) .await; assert_eq!(response.status(), 200); 
assert_eq!( @@ -141,7 +141,7 @@ async fn test_valid_extensions_service_is_preserved_for_subgraph_error() -> Resu router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -174,7 +174,7 @@ async fn test_valid_extensions_service_for_invalid_subgraph_response() -> Result router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -222,7 +222,7 @@ async fn test_valid_error_locations() -> Result<(), BoxError> { router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -264,7 +264,7 @@ async fn test_empty_error_locations() -> Result<(), BoxError> { router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -302,7 +302,7 @@ async fn test_invalid_error_locations() -> Result<(), BoxError> { router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -345,7 +345,7 @@ async fn test_invalid_error_locations_with_single_negative_one_location() -> Res 
router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -387,7 +387,7 @@ async fn test_invalid_error_locations_contains_negative_one_location() -> Result router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ "query": "{ topProducts { name } }" })) + .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) .await; assert_eq!(response.status(), 200); assert_eq!( diff --git a/apollo-router/tests/integration/subscription.rs b/apollo-router/tests/integration/subscription.rs index 911503593f..74c42f2034 100644 --- a/apollo-router/tests/integration/subscription.rs +++ b/apollo-router/tests/integration/subscription.rs @@ -4,7 +4,7 @@ use http::HeaderValue; use serde_json::json; use tower::BoxError; -use super::common::IntegrationTest; +use super::common::{IntegrationTest, Query}; use super::common::Telemetry; const SUBSCRIPTION_CONFIG: &str = include_str!("../fixtures/subscription.router.yaml"); @@ -59,8 +59,7 @@ async fn test_subscription_load() -> Result<(), BoxError> { for _ in 0..100 { let (_id, resp) = router - .execute_query( - &json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}), + .execute_query(Query::builder().body(json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}})).build(), ) .await; assert!(resp.status().is_success()); diff --git a/apollo-router/tests/integration/supergraph.rs b/apollo-router/tests/integration/supergraph.rs index 97d5131d84..5732b0921d 100644 --- a/apollo-router/tests/integration/supergraph.rs +++ b/apollo-router/tests/integration/supergraph.rs @@ -1,8 +1,7 @@ -use std::collections::HashMap; use serde_json::json; use tower::BoxError; - +use crate::integration::common::Query; use 
crate::integration::IntegrationTest; #[cfg(not(feature = "hyper_header_limits"))] @@ -100,11 +99,8 @@ async fn test_supergraph_errors_on_http1_header_that_does_not_fit_inside_buffer( router.start().await; router.assert_started().await; - let mut headers = HashMap::new(); - headers.insert("test-header".to_string(), "x".repeat(1048576 + 1)); - let (_trace_id, response) = router - .execute_query_with_headers(&json!({ "query": "{ __typename }"}), headers) + .execute_query(Query::builder().body(json!({ "query": "{ __typename }"})).header("test-header", "x".repeat(1048576 + 1)).build()) .await; assert_eq!(response.status(), 431); Ok(()) @@ -125,11 +121,8 @@ async fn test_supergraph_allow_to_change_http1_max_buf_size() -> Result<(), BoxE router.start().await; router.assert_started().await; - let mut headers = HashMap::new(); - headers.insert("test-header".to_string(), "x".repeat(1048576 + 1)); - let (_trace_id, response) = router - .execute_query_with_headers(&json!({ "query": "{ __typename }"}), headers) + .execute_query(Query::builder().body(json!({ "query": "{ __typename }"})).header("test-header", "x".repeat(1048576 + 1)).build()) .await; assert_eq!(response.status(), 200); assert_eq!( diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index 948ff9d165..591a2702fc 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -1,68 +1,43 @@ extern crate core; -use std::collections::HashMap; use std::collections::HashSet; -use std::sync::Arc; -use std::sync::Mutex; -use std::time::Duration; +use std::ops::Deref; use anyhow::anyhow; -use opentelemetry_api::trace::SpanContext; use opentelemetry_api::trace::TraceId; -use serde_json::json; use serde_json::Value; use tower::BoxError; -use wiremock::ResponseTemplate; use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use 
crate::integration::common::Telemetry; +use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; use crate::integration::ValueExt; -#[derive(buildstructor::Builder)] -struct TraceSpec { - operation_name: Option, - version: Option, - services: HashSet<&'static str>, - span_names: HashSet<&'static str>, - measured_spans: HashSet<&'static str>, - unmeasured_spans: HashSet<&'static str>, - priority_sampled: Option<&'static str>, - subgraph_sampled: Option, - subgraph_context: Option>>>, - // Not the metrics but the otel attribute - no_priority_sampled_attribute: Option, -} - #[tokio::test(flavor = "multi_thread")] async fn test_no_sample() -> Result<(), BoxError> { if !graph_os_enabled() { return Ok(()); } - let context = std::sync::Arc::new(std::sync::Mutex::new(None)); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!("fixtures/datadog_no_sample.router.yaml")) - .responder(ResponseTemplate::new(200).set_body_json( - json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), - )) - .subgraph_context(context.clone()) .build() .await; router.start().await; router.assert_started().await; + TraceSpec::builder() + .services(["router"].into()) + .subgraph_sampled(false) + .priority_sampled("0") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_id, result) = router.execute_untraced_query(&query, None).await; router.graceful_shutdown().await; - assert!(result.status().is_success()); - let context = context.lock().expect("poisoned"); - assert!(!context.as_ref().unwrap().is_sampled()); - assert_eq!( - context.as_ref().unwrap().trace_state().get("psr"), - Some("0") - ); Ok(()) } @@ -73,34 +48,25 @@ async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> { if 
!graph_os_enabled() { return Ok(()); } - let context = std::sync::Arc::new(std::sync::Mutex::new(None)); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!( "fixtures/datadog_agent_sampling_disabled.router.yaml" )) - .responder(ResponseTemplate::new(200).set_body_json( - json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), - )) - .subgraph_context(context.clone()) .build() .await; router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_untraced_query(&query, None).await; - router.graceful_shutdown().await; - assert!(result.status().is_success()); - tokio::time::sleep(Duration::from_secs(2)).await; - TraceSpec::builder() .services([].into()) .subgraph_sampled(false) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) .await?; + router.graceful_shutdown().await; + Ok(()) } @@ -109,14 +75,9 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { if !graph_os_enabled() { return Ok(()); } - let context = std::sync::Arc::new(std::sync::Mutex::new(None)); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!("fixtures/datadog.router.yaml")) - .responder(ResponseTemplate::new(200).set_body_json( - json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), - )) - .subgraph_context(context.clone()) .build() .await; @@ -124,58 +85,46 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { router.assert_started().await; // Parent based sampling. psr MUST be populated with the value that we pass in. 
- test_psr( - &mut router, - Some("-1"), - TraceSpec::builder() - .services(["client", "router"].into()) - .subgraph_sampled(false) - .priority_sampled("-1") - .build(), - ) - .await?; - test_psr( - &mut router, - Some("0"), - TraceSpec::builder() - .services(["client", "router"].into()) - .subgraph_sampled(false) - .priority_sampled("0") - .build(), - ) - .await?; - test_psr( - &mut router, - Some("1"), - TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) - .subgraph_sampled(true) - .priority_sampled("1") - .build(), - ) - .await?; - test_psr( - &mut router, - Some("2"), - TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) - .subgraph_sampled(true) - .priority_sampled("2") - .build(), - ) - .await?; + TraceSpec::builder() + .services(["client", "router"].into()) + .subgraph_sampled(false) + .priority_sampled("-1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("-1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router"].into()) + .subgraph_sampled(false) + .priority_sampled("0") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("0").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("2") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("2").build()) + .await?; // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. 
- test_psr( - &mut router, - None, - TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) - .subgraph_sampled(true) - .priority_sampled("1") - .build(), - ) - .await?; + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; router.graceful_shutdown().await; @@ -187,35 +136,22 @@ async fn test_priority_sampling_propagated_otel_request() -> Result<(), BoxError if !graph_os_enabled() { return Ok(()); } - let context = std::sync::Arc::new(std::sync::Mutex::new(None)); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { endpoint: None }) + .extra_propagator(Telemetry::Datadog) .config(include_str!("fixtures/datadog.router.yaml")) - .responder(ResponseTemplate::new(200).set_body_json( - json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), - )) - .subgraph_context(context.clone()) .build() .await; router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); + TraceSpec::builder() .services(["router"].into()) .priority_sampled("1") + .subgraph_sampled(true) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; router.graceful_shutdown().await; @@ -228,16 +164,11 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { if !graph_os_enabled() { return Ok(()); } - let context = std::sync::Arc::new(std::sync::Mutex::new(None)); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Datadog) .config(include_str!( "fixtures/datadog_no_parent_sampler.router.yaml" )) - 
.responder(ResponseTemplate::new(200).set_body_json( - json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), - )) - .subgraph_context(context.clone()) .build() .await; @@ -245,74 +176,141 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { router.assert_started().await; // The router will ignore the upstream PSR as parent based sampling is disabled. - test_psr( - &mut router, - Some("-1"), - TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .build(), - ) - .await?; - test_psr( - &mut router, - Some("0"), - TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .build(), - ) - .await?; - test_psr( - &mut router, - Some("1"), - TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .build(), - ) - .await?; - test_psr( - &mut router, - Some("2"), - TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .build(), - ) - .await?; - - test_psr( - &mut router, - None, - TraceSpec::builder() - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .build(), - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("-1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("0").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("1").build()) + .await?; + + TraceSpec::builder() + 
.services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("2").build()) + .await?; + + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; router.graceful_shutdown().await; Ok(()) } -async fn test_psr( - router: &mut IntegrationTest, - psr: Option<&str>, - trace_spec: TraceSpec, -) -> Result<(), BoxError> { - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let headers = if let Some(psr) = psr { - vec![("x-datadog-sampling-priority".to_string(), psr.to_string())] - } else { - vec![] - }; - let (id, result) = router - .execute_query_with_headers(&query, headers.into_iter().collect()) +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_parent_sampler_very_small() -> Result<(), BoxError> { + // Note that there is a very small chance this test will fail. We are trying to test a non-zero sampler. + + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_parent_sampler_very_small.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // The router should respect upstream but also almost never sample if left to its own devices. 
+ TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("-1") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("-1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("0").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("1").build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("2") + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).psr("2").build()) + .await?; + + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_untraced_request() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_parent_sampler_very_small.router.yaml" + )) + .build() .await; - assert!(result.status().is_success()); - trace_spec.validate_trace(id).await?; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + router.graceful_shutdown().await; + Ok(()) } @@ -332,20 +330,9 @@ async fn test_default_span_names() -> Result<(), 
BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") .span_names( [ "query_planning", @@ -363,8 +350,9 @@ async fn test_default_span_names() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -384,20 +372,9 @@ async fn test_override_span_names() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") .span_names( [ "query_planning", @@ -415,8 +392,9 @@ async fn test_override_span_names() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -435,21 +413,9 @@ async fn test_override_span_names_late() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - 
id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") .span_names( [ "query_planning", @@ -467,8 +433,9 @@ async fn test_override_span_names_late() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -486,20 +453,9 @@ async fn test_basic() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .operation_name("ExampleQuery") + .priority_sampled("1") .services(["client", "router", "subgraph"].into()) .span_names( [ @@ -530,8 +486,9 @@ async fn test_basic() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -549,23 +506,6 @@ async fn test_with_parent_span() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let mut headers = HashMap::new(); - headers.insert( - "traceparent".to_string(), - String::from("00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01"), - ); - let (id, result) = router.execute_query_with_headers(&query, headers).await; - assert_eq!( - result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .to_str() - .unwrap(), - id.to_datadog() - ); - router.graceful_shutdown().await; TraceSpec::builder() .operation_name("ExampleQuery") .services(["client", "router", "subgraph"].into()) 
@@ -598,8 +538,18 @@ async fn test_with_parent_span() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace( + &mut router, + Query::builder() + .traced(true) + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01", + ) + .build(), + ) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -619,13 +569,6 @@ async fn test_resource_mapping_default() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); TraceSpec::builder() .operation_name("ExampleQuery") .services(["client", "router", "subgraph"].into()) @@ -645,7 +588,7 @@ async fn test_resource_mapping_default() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; router.graceful_shutdown().await; Ok(()) @@ -667,14 +610,6 @@ async fn test_resource_mapping_override() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - router.graceful_shutdown().await; TraceSpec::builder() .services(["client", "router", "subgraph"].into()) .span_names( @@ -693,8 +628,9 @@ async fn test_resource_mapping_override() -> Result<(), BoxError> { .into(), ) .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -712,14 +648,6 @@ async fn test_span_metrics() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let 
query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - router.graceful_shutdown().await; TraceSpec::builder() .operation_name("ExampleQuery") .services(["client", "router", "subgraph"].into()) @@ -740,8 +668,9 @@ async fn test_span_metrics() -> Result<(), BoxError> { .measured_span("subgraph") .unmeasured_span("supergraph") .build() - .validate_trace(id) + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) .await?; + router.graceful_shutdown().await; Ok(()) } @@ -755,58 +684,33 @@ impl DatadogId for TraceId { } } -impl TraceSpec { - #[allow(clippy::too_many_arguments)] - async fn validate_trace(&self, id: TraceId) -> Result<(), BoxError> { - let datadog_id = id.to_datadog(); - let url = format!("http://localhost:8126/test/traces?trace_ids={datadog_id}"); - for _ in 0..20 { - if self.find_valid_trace(&url).await.is_ok() { - return Ok(()); - } - tokio::time::sleep(Duration::from_millis(100)).await; - } - self.find_valid_trace(&url).await?; - - if let Some(subgraph_context) = &self.subgraph_context { - let subgraph_context = subgraph_context.lock().expect("poisoned"); - let subgraph_span_context = subgraph_context.as_ref().expect("state").clone(); +struct DatadogTraceSpec { + trace_spec: TraceSpec, +} +impl Deref for DatadogTraceSpec { + type Target = TraceSpec; - assert_eq!( - subgraph_span_context.trace_state().get("psr"), - self.priority_sampled - ); - if let Some(sampled) = self.subgraph_sampled { - assert_eq!(subgraph_span_context.is_sampled(), sampled); - } - } - Ok(()) + fn deref(&self) -> &Self::Target { + &self.trace_spec } +} - #[allow(clippy::too_many_arguments)] - async fn find_valid_trace(&self, url: &str) -> Result<(), BoxError> { - // A valid trace has: - // * All three services - // * The correct spans - // * All spans are parented - // * Required 
attributes of 'router' span has been set +impl Verifier for DatadogTraceSpec { + fn spec(&self) -> &TraceSpec { + &self.trace_spec + } - // For now just validate service name. - let trace: Value = reqwest::get(url) + async fn get_trace(&self, trace_id: TraceId) -> Result { + let datadog_id = trace_id.to_datadog(); + let url = format!("http://localhost:8126/test/traces?trace_ids={datadog_id}"); + println!("url: {}", url); + let value: serde_json::Value = reqwest::get(url) .await .map_err(|e| anyhow!("failed to contact datadog; {}", e))? .json() - .await?; - tracing::debug!("{}", serde_json::to_string_pretty(&trace)?); - self.verify_trace_participants(&trace)?; - self.verify_spans_present(&trace)?; - self.verify_measured_spans(&trace)?; - self.verify_operation_name(&trace)?; - self.verify_priority_sampled(&trace)?; - self.verify_priority_sampled_attribute(&trace)?; - self.verify_version(&trace)?; - self.verify_span_kinds(&trace)?; - Ok(()) + .await + .map_err(|e| anyhow!("failed to contact datadog; {}", e))?; + Ok(value) } fn verify_version(&self, trace: &Value) -> Result<(), BoxError> { @@ -824,24 +728,6 @@ impl TraceSpec { Ok(()) } - fn verify_measured_spans(&self, trace: &Value) -> Result<(), BoxError> { - for expected in &self.measured_spans { - assert!( - self.measured_span(trace, expected)?, - "missing measured span {}", - expected - ); - } - for unexpected in &self.unmeasured_spans { - assert!( - !self.measured_span(trace, unexpected)?, - "unexpected measured span {}", - unexpected - ); - } - Ok(()) - } - fn measured_span(&self, trace: &Value, name: &str) -> Result { let binding1 = trace.select_path(&format!( "$..[?(@.meta.['otel.original_name'] == '{}')].metrics.['_dd.measured']", @@ -859,17 +745,7 @@ impl TraceSpec { .unwrap_or_default()) } - fn verify_span_kinds(&self, trace: &Value) -> Result<(), BoxError> { - // Validate that the span.kind has been propagated. We can just do this for a selection of spans. 
- if self.services.contains("router") { - self.validate_span_kind(trace, "router", "server")?; - self.validate_span_kind(trace, "supergraph", "internal")?; - self.validate_span_kind(trace, "http_request", "client")?; - } - Ok(()) - } - - fn verify_trace_participants(&self, trace: &Value) -> Result<(), BoxError> { + fn verify_services(&self, trace: &Value) -> Result<(), BoxError> { let actual_services: HashSet = trace .select_path("$..service")? .into_iter() @@ -897,7 +773,7 @@ impl TraceSpec { .filter_map(|span_name| span_name.as_string()) .collect(); let mut span_names: HashSet<&str> = self.span_names.clone(); - if self.services.contains("client") { + if self.services.contains(&"client") { span_names.insert("client_request"); } tracing::debug!("found spans {:?}", operation_names); @@ -914,6 +790,7 @@ impl TraceSpec { } fn validate_span_kind(&self, trace: &Value, name: &str, kind: &str) -> Result<(), BoxError> { + let binding1 = trace.select_path(&format!( "$..[?(@.meta.['otel.original_name'] == '{}')].meta.['span.kind']", name @@ -979,16 +856,19 @@ impl TraceSpec { Ok(()) } - fn verify_priority_sampled_attribute(&self, trace: &Value) -> Result<(), BoxError> { - if self.no_priority_sampled_attribute.unwrap_or_default() { - let binding = - trace.select_path("$..[?(@.service=='router')].meta['sampling.priority']")?; - if binding.is_empty() { - return Ok(()); - } else { - return Err(BoxError::from("sampling priority attribute exists")); - } - } + fn verify_span_attributes(&self, _trace: &Value) -> Result<(), BoxError> { Ok(()) } } + +impl TraceSpec { + async fn validate_datadog_trace( + self, + router: &mut IntegrationTest, + query: Query, + ) -> Result<(), BoxError> { + DatadogTraceSpec { trace_spec: self } + .validate_trace(router, query) + .await + } +} diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml new 
file mode 100644 index 0000000000..90a5594503 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml @@ -0,0 +1,29 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + format: datadog + propagation: + trace_context: true + jaeger: true + common: + service_name: router + sampler: 0.000000001 + parent_based_sampler: true + resource: + env: local1 + service.version: router_version_override + preview_datadog_agent_sampling: true + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + diff --git a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml index 4e31e0d1d6..3bcb4e5db5 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml @@ -5,6 +5,7 @@ telemetry: tracing: propagation: zipkin: true + datadog: true trace_context: true common: service_name: router diff --git a/apollo-router/tests/integration/telemetry/jaeger.rs b/apollo-router/tests/integration/telemetry/jaeger.rs index c9e79bd22a..2c22ef4fc5 100644 --- a/apollo-router/tests/integration/telemetry/jaeger.rs +++ b/apollo-router/tests/integration/telemetry/jaeger.rs @@ -1,7 +1,7 @@ extern crate core; use std::collections::HashSet; -use std::time::Duration; +use std::ops::Deref; use anyhow::anyhow; use opentelemetry_api::trace::TraceId; @@ -9,7 +9,9 @@ use serde_json::json; use serde_json::Value; use tower::BoxError; -use crate::integration::common::Telemetry; +use crate::integration::common::{Query, Telemetry}; +use 
crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; use crate::integration::ValueExt; @@ -24,22 +26,13 @@ async fn test_reload() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); for _ in 0..2 { - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .operation_name("ExampleQuery") + .build() + .validate_jaeger_trace(&mut router, Query::default()) + .await?; router.touch_config().await; router.assert_reloaded().await; } @@ -58,21 +51,11 @@ async fn test_remote_root() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .build() + .validate_jaeger_trace(&mut router, Query::default()) + .await?; router.graceful_shutdown().await; Ok(()) @@ -89,21 +72,12 @@ async fn test_local_root() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_untraced_query(&query, None).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - 
&query, - Some("ExampleQuery"), - &["router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .operation_name("ExampleQuery") + .build() + .validate_jaeger_trace(&mut router, Query::builder().traced(false).build()) + .await?; router.graceful_shutdown().await; Ok(()) @@ -120,8 +94,9 @@ async fn test_local_root_no_sample() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_, response) = router.execute_untraced_query(&query, None).await; + let (_, response) = router + .execute_query(Query::builder().traced(false).build()) + .await; assert!(response.headers().get("apollo-custom-trace-id").is_some()); router.graceful_shutdown().await; @@ -138,19 +113,13 @@ async fn test_local_root_50_percent_sample() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}\n","variables":{}, "operationName": "ExampleQuery"}); for _ in 0..100 { - let (id, result) = router.execute_untraced_query(&query, None).await; - - if result.headers().get("apollo-custom-trace-id").is_some() - && validate_trace( - id, - &query, - Some("ExampleQuery"), - &["router", "subgraph"], - false, - ) + if TraceSpec::builder() + .services(["router", "subgraph"].into()) + .operation_name("ExampleQuery") + .build() + .validate_jaeger_trace(&mut router, Query::builder().traced(false).build()) .await .is_ok() { @@ -176,10 +145,11 @@ async fn test_no_telemetry() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (_, response) = router.execute_untraced_query(&query, None).await; - assert!(response.headers().get("apollo-custom-trace-id").is_none()); - + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .build() + 
.validate_jaeger_trace(&mut router, Query::builder().traced(false).build()) + .await?; router.graceful_shutdown().await; Ok(()) } @@ -194,22 +164,11 @@ async fn test_default_operation() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery1 {topProducts{name}}","variables":{}}); - - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery1"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .build() + .validate_jaeger_trace(&mut router, Query::default()) + .await?; router.graceful_shutdown().await; Ok(()) } @@ -225,15 +184,11 @@ async fn test_anonymous_operation() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query {topProducts{name}}","variables":{}}); - - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace(id, &query, None, &["client", "router", "subgraph"], false).await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .build() + .validate_jaeger_trace(&mut router, Query::builder().build()) + .await?; router.graceful_shutdown().await; Ok(()) } @@ -248,22 +203,15 @@ async fn test_selected_operation() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery1 {topProducts{name}}\nquery ExampleQuery2 {topProducts{name}}","variables":{}, "operationName": "ExampleQuery2"}); - - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery2"), - &["client", 
"router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder().services(["client", "router", "subgraph"].into()) + .operation_name("ExampleQuery2") + .build() + .validate_jaeger_trace( + &mut router, + Query::builder() + .body(json!({"query":"query ExampleQuery1 {topProducts{name}}\nquery ExampleQuery2 {topProducts{name}}","variables":{}, "operationName": "ExampleQuery2"}) + ).build(), + ).await?; router.graceful_shutdown().await; Ok(()) } @@ -280,16 +228,12 @@ async fn test_span_customization() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, _res) = router.execute_query(&query).await; - validate_trace( - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - true, - ) - .await?; + TraceSpec::builder().services(["client", "router", "subgraph"].into()) + .operation_name("ExampleQuery") + .span_attribute("http.request.method", "POST") + .build() + .validate_jaeger_trace(&mut router, Query::builder().build()) + .await?; router.graceful_shutdown().await; } Ok(()) @@ -305,9 +249,8 @@ async fn test_decimal_trace_id() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery1 {topProducts{name}}","variables":{}}); - let (id, result) = router.execute_query(&query).await; + let (id, result) = router.execute_query(Query::default()).await; let id_from_router: u128 = result .headers() .get("apollo-custom-trace-id") @@ -317,343 +260,191 @@ async fn test_decimal_trace_id() -> Result<(), BoxError> { .parse() .expect("expected decimal trace ID"); assert_eq!(format!("{:x}", id_from_router), id.to_string()); - - validate_trace( - id, - &query, - Some("ExampleQuery1"), - &["client", "router", "subgraph"], - false, - ) - .await?; router.graceful_shutdown().await; Ok(()) } -async fn validate_trace( - id: TraceId, - query: &Value, - operation_name: Option<&str>, 
- services: &[&'static str], - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - let params = url::form_urlencoded::Serializer::new(String::new()) - .append_pair("service", services.first().expect("expected root service")) - .finish(); - - let id = id.to_string(); - let url = format!("http://localhost:16686/api/traces/{id}?{params}"); - for _ in 0..10 { - if find_valid_trace( - &url, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await - .is_ok() - { - return Ok(()); - } - tokio::time::sleep(Duration::from_millis(1000)).await; - } - find_valid_trace( - &url, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await?; - Ok(()) +struct JaegerTraceSpec { + trace_spec: TraceSpec, } +impl Deref for JaegerTraceSpec { + type Target = TraceSpec; -async fn find_valid_trace( - url: &str, - query: &Value, - operation_name: Option<&str>, - services: &[&'static str], - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - // A valid trace has: - // * All three services - // * The correct spans - // * All spans are parented - // * Required attributes of 'router' span has been set - let trace: Value = reqwest::get(url) - .await - .map_err(|e| anyhow!("failed to contact jaeger; {}", e))? 
- .json() - .await?; - tracing::debug!("{}", serde_json::to_string_pretty(&trace)?); - - // Verify that we got all the participants in the trace - verify_trace_participants(&trace, services)?; - - // Verify that we got the expected span operation names - verify_spans_present(&trace, operation_name, services)?; - - // Verify that all spans have a path to the root 'client_request' span - verify_span_parenting(&trace, services)?; + fn deref(&self) -> &Self::Target { + &self.trace_spec + } +} - // Verify that root span fields are present - verify_root_span_fields(&trace, operation_name)?; +impl Verifier for JaegerTraceSpec { + fn spec(&self) -> &TraceSpec { + &self.trace_spec + } - // Verify that supergraph span fields are present - verify_supergraph_span_fields(&trace, query, operation_name, custom_span_instrumentation)?; + fn verify_span_attributes(&self, trace: &Value) -> Result<(), BoxError> { + for (key, value) in &self.span_attributes { + let binding = trace.select_path(&format!( + "$..tags[?(@.key == '{key}')].value"))?; + let actual_value = binding + .first() + .expect("expected binding") + .as_str() + .expect("expected string"); + assert_eq!(actual_value, *value); + } + Ok(()) + } - // Verify that router span fields are present - verify_router_span_fields(&trace, custom_span_instrumentation)?; + async fn get_trace(&self, trace_id: TraceId) -> Result { + let params = url::form_urlencoded::Serializer::new(String::new()) + .append_pair( + "service", + self.trace_spec + .services + .first() + .expect("expected root service"), + ) + .finish(); - Ok(()) -} + let id = trace_id.to_string(); + let url = format!("http://localhost:16686/api/traces/{id}?{params}"); + println!("url: {}", url); + let value: serde_json::Value = reqwest::get(url) + .await + .map_err(|e| anyhow!("failed to contact jaeger; {}", e))? 
+ .json() + .await + .map_err(|e| anyhow!("failed to contact jaeger; {}", e))?; -fn verify_router_span_fields( - trace: &Value, - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - let router_span = trace.select_path("$..spans[?(@.operationName == 'router')]")?[0]; - // We can't actually assert the values on a span. Only that a field has been set. - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'client.name')].value")? - .first(), - Some(&&Value::String("custom_name".to_string())) - ); - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'client.version')].value")? - .first(), - Some(&&Value::String("1.0".to_string())) - ); - assert!(router_span - .select_path("$.logs[*].fields[?(@.key == 'histogram.apollo_router_span')].value")? - .is_empty(),); - assert!(router_span - .select_path("$.logs[*].fields[?(@.key == 'histogram.apollo_router_span')].value")? - .is_empty(),); - if custom_span_instrumentation { - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'http.request.method')].value")? - .first(), - Some(&&Value::String("POST".to_string())) - ); - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'http.request.header.x-not-present')].value")? - .first(), - Some(&&Value::String("nope".to_string())) - ); - assert_eq!( - router_span - .select_path( - "$.tags[?(@.key == 'http.request.header.x-my-header-condition')].value" - )? - .first(), - Some(&&Value::String("test".to_string())) - ); - assert_eq!( - router_span - .select_path("$.tags[?(@.key == 'studio.operation.id')].value")? - .first(), - Some(&&Value::String( - "f60e643d7f52ecda23216f86409d7e2e5c3aa68c".to_string() - )) - ); + Ok(value) } - Ok(()) -} - -fn verify_root_span_fields(trace: &Value, operation_name: Option<&str>) -> Result<(), BoxError> { - // We can't actually assert the values on a span. Only that a field has been set. 
- let root_span_name = operation_name - .map(|name| format!("query {}", name)) - .unwrap_or("query".to_string()); - let request_span = trace.select_path(&format!( - "$..spans[?(@.operationName == '{root_span_name}')]" - ))?[0]; - - if let Some(operation_name) = operation_name { - assert_eq!( - request_span - .select_path("$.tags[?(@.key == 'graphql.operation.name')].value")? - .first(), - Some(&&Value::String(operation_name.to_string())) - ); - } else { - assert!(request_span - .select_path("$.tags[?(@.key == 'graphql.operation.name')].value")? - .first() - .is_none(),); + fn verify_version(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_version) = &self.version { + let binding = trace.select_path("$..version")?; + let version = binding.first(); + assert_eq!( + version + .expect("version expected") + .as_str() + .expect("version must be a string"), + expected_version + ); + } + Ok(()) } - assert_eq!( - request_span - .select_path("$.tags[?(@.key == 'graphql.operation.type')].value")? - .first(), - Some(&&Value::String("query".to_string())) - ); - - Ok(()) -} - -fn verify_supergraph_span_fields( - trace: &Value, - query: &Value, - operation_name: Option<&str>, - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - // We can't actually assert the values on a span. Only that a field has been set. - let supergraph_span = trace.select_path("$..spans[?(@.operationName == 'supergraph')]")?[0]; - - if let Some(operation_name) = operation_name { - assert_eq!( - supergraph_span - .select_path("$.tags[?(@.key == 'graphql.operation.name')].value")? - .first(), - Some(&&Value::String(operation_name.to_string())) - ); - } else { - assert!(supergraph_span - .select_path("$.tags[?(@.key == 'graphql.operation.name')].value")? 
+ fn measured_span(&self, trace: &Value, name: &str) -> Result { + let binding1 = trace.select_path(&format!( + "$..[?(@.meta.['otel.original_name'] == '{}')].metrics.['_dd.measured']", + name + ))?; + let binding2 = trace.select_path(&format!( + "$..[?(@.name == '{}')].metrics.['_dd.measured']", + name + ))?; + Ok(binding1 .first() - .is_none(),); + .or(binding2.first()) + .and_then(|v| v.as_f64()) + .map(|v| v == 1.0) + .unwrap_or_default()) } - if custom_span_instrumentation { - assert_eq!( - supergraph_span - .select_path("$.tags[?(@.key == 'graphql.operation.type')].value")? - .first(), - Some(&&Value::String("query".to_string())) - ); + + fn verify_services(&self, trace: &Value) -> Result<(), BoxError> { + let actual_services: HashSet = trace + .select_path("$..serviceName")? + .into_iter() + .filter_map(|service| service.as_string()) + .collect(); + tracing::debug!("found services {:?}", actual_services); + + let expected_services = self.trace_spec.services + .iter() + .map(|s| s.to_string()) + .collect::>(); + if actual_services != expected_services { + return Err(BoxError::from(format!( + "incomplete traces, got {actual_services:?} expected {expected_services:?}" + ))); + } + Ok(()) } - assert_eq!( - supergraph_span - .select_path("$.tags[?(@.key == 'graphql.document')].value")? - .first(), - Some(&&Value::String( - query - .as_object() - .expect("should have been an object") - .get("query") - .expect("must have a query") - .as_str() - .expect("must be a string") - .to_string() - )) - ); + fn verify_spans_present(&self, trace: &Value) -> Result<(), BoxError> { - Ok(()) -} + let operation_names: HashSet = trace + .select_path("$..operationName")? + .into_iter() + .filter_map(|span_name| span_name.as_string()) + .collect(); -fn verify_trace_participants(trace: &Value, services: &[&'static str]) -> Result<(), BoxError> { - let actual_services: HashSet = trace - .select_path("$..serviceName")? 
- .into_iter() - .filter_map(|service| service.as_string()) - .collect(); - tracing::debug!("found services {:?}", actual_services); - - let expected_services = services - .iter() - .map(|s| s.to_string()) - .collect::>(); - if actual_services != expected_services { - return Err(BoxError::from(format!( - "incomplete traces, got {actual_services:?} expected {expected_services:?}" - ))); + let mut span_names: HashSet<&str> = self.span_names.clone(); + if self.services.contains(&"client") { + span_names.insert("client_request"); + } + tracing::debug!("found spans {:?}", operation_names); + let missing_operation_names: Vec<_> = span_names + .iter() + .filter(|o| !operation_names.contains(**o)) + .collect(); + if !missing_operation_names.is_empty() { + return Err(BoxError::from(format!( + "spans did not match, got {operation_names:?}, missing {missing_operation_names:?}" + ))); + } + Ok(()) } - Ok(()) -} -fn verify_spans_present( - trace: &Value, - operation_name: Option<&str>, - services: &[&'static str], -) -> Result<(), BoxError> { - let operation_names: HashSet = trace - .select_path("$..operationName")? 
- .into_iter() - .filter_map(|span_name| span_name.as_string()) - .collect(); - let mut expected_operation_names: HashSet = HashSet::from( - [ - "execution", - "subgraph server", - operation_name - .map(|name| format!("query {name}")) - .unwrap_or("query".to_string()) - .as_str(), - "supergraph", - "fetch", - //"parse_query", Parse query will only happen once - //"query_planning", query planning will only happen once - "subgraph", - ] - .map(|s| s.into()), - ); - if services.contains(&"client") { - expected_operation_names.insert("client_request".into()); - } - tracing::debug!("found spans {:?}", operation_names); - let missing_operation_names: Vec<_> = expected_operation_names - .iter() - .filter(|o| !operation_names.contains(*o)) - .collect(); - if !missing_operation_names.is_empty() { - return Err(BoxError::from(format!( - "spans did not match, got {operation_names:?}, missing {missing_operation_names:?}" - ))); + fn validate_span_kind(&self, _trace: &Value, _name: &str, _kind: &str) -> Result<(), BoxError> { + Ok(()) } - Ok(()) -} -fn verify_span_parenting(trace: &Value, services: &[&'static str]) -> Result<(), BoxError> { - let root_span = if services.contains(&"client") { - trace.select_path("$..spans[?(@.operationName == 'client_request')]")?[0] - } else { - trace.select_path("$..spans[?(@.operationName == 'query ExampleQuery')]")?[0] - }; - let spans = trace.select_path("$..spans[*]")?; - for span in spans { - let mut span_path = vec![span.select_path("$.operationName")?[0] - .as_str() - .expect("operation name not not found")]; - let mut current = span; - while let Some(parent) = parent_span(trace, current) { - span_path.push( - parent.select_path("$.operationName")?[0] + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_operation_name) = &self.operation_name { + let binding = + trace.select_path("$..spans[?(@.operationName == 'supergraph')]..tags[?(@.key == 'graphql.operation.name')].value")?; + 
println!("binding: {:?}", binding); + let operation_name = binding.first(); + assert_eq!( + operation_name + .expect("graphql.operation.name expected") .as_str() - .expect("operation name not not found"), + .expect("graphql.operation.name must be a string"), + expected_operation_name ); - current = parent; } - tracing::debug!("span path to root: '{:?}'", span_path); - if current != root_span { - return Err(BoxError::from(format!( - "span {:?} did not have a path to the root span", - span.select_path("$.operationName")?, - ))); + Ok(()) + } + + fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(psr) = self.priority_sampled { + let binding = + trace.select_path("$..[?(@.service=='router')].metrics._sampling_priority_v1")?; + if binding.is_empty() { + return Err(BoxError::from("missing sampling priority")); + } + for sampling_priority in binding { + assert_eq!( + sampling_priority + .as_f64() + .expect("psr not string") + .to_string(), + psr + ); + } } + Ok(()) } - Ok(()) } -fn parent_span<'a>(trace: &'a Value, span: &'a Value) -> Option<&'a Value> { - span.select_path("$.references[?(@.refType == 'CHILD_OF')].spanID") - .ok()? - .into_iter() - .filter_map(|id| id.as_str()) - .filter_map(|id| { - trace - .select_path(&format!("$..spans[?(@.spanID == '{id}')]")) - .ok()? 
- .into_iter() - .next() - }) - .next() +impl TraceSpec { + async fn validate_jaeger_trace( + self, + router: &mut IntegrationTest, + query: Query, + ) -> Result<(), BoxError> { + JaegerTraceSpec { trace_spec: self } + .validate_trace(router, query) + .await + } } diff --git a/apollo-router/tests/integration/telemetry/logging.rs b/apollo-router/tests/integration/telemetry/logging.rs index 9e41160572..21c19a3246 100644 --- a/apollo-router/tests/integration/telemetry/logging.rs +++ b/apollo-router/tests/integration/telemetry/logging.rs @@ -1,8 +1,7 @@ -use serde_json::json; use tower::BoxError; use uuid::Uuid; -use crate::integration::common::graph_os_enabled; +use crate::integration::common::{graph_os_enabled, Query}; use crate::integration::common::IntegrationTest; use crate::integration::common::Telemetry; @@ -22,16 +21,15 @@ async fn test_json() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""static_one":"test""#).await; #[cfg(unix)] { - router.execute_query(&query).await; + router.execute_default_query().await; router .assert_log_contains( r#""schema.id":"dd8960ccefda82ca58e8ac0bc266459fd49ee8215fd6b3cc72e7bc3d7f3464b9""#, @@ -39,11 +37,11 @@ async fn test_json() -> Result<(), BoxError> { .await; } - router.execute_query(&query).await; + router.execute_default_query().await; router .assert_log_contains(r#""on_supergraph_response_event":"on_supergraph_event""#) .await; - router.execute_query(&query).await; + router.execute_default_query().await; 
router.assert_log_contains(r#""response_status":200"#).await; router.graceful_shutdown().await; @@ -66,24 +64,23 @@ async fn test_json_promote_span_attributes() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_query(Query::default()).await; router.assert_log_contains("span_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""static_one":"test""#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""response_status":200"#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""too_big":true"#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""too_big":"nope""#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router .assert_log_contains(r#""graphql.document":"query ExampleQuery {topProducts{name}}""#) .await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_not_contains(r#""should_not_log""#).await; router.assert_log_not_contains(r#""another_one""#).await; router.graceful_shutdown().await; @@ -107,14 +104,13 @@ async fn test_json_uuid_format() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - let (trace_id, _) = router.execute_query(&query).await; + let (trace_id, _) = 
router.execute_default_query().await; router .assert_log_contains(&format!("{}", Uuid::from_bytes(trace_id.to_bytes()))) .await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; router.graceful_shutdown().await; @@ -137,14 +133,13 @@ async fn test_text_uuid_format() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - let (trace_id, _) = router.execute_query(&query).await; + let (trace_id, _) = router.execute_default_query().await; router .assert_log_contains(&format!("{}", Uuid::from_bytes(trace_id.to_bytes()))) .await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; router.graceful_shutdown().await; @@ -166,18 +161,17 @@ async fn test_json_sampler_off() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""static_one":"test""#).await; - router.execute_query(&query).await; + router.execute_default_query().await; router .assert_log_contains(r#""on_supergraph_response_event":"on_supergraph_event""#) .await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains(r#""response_status":200"#).await; router.graceful_shutdown().await; @@ -200,17 +194,16 @@ async fn test_text() -> 
Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; - router.execute_query(&query).await; + router.execute_query(Query::default()).await; + router.execute_query(Query::default()).await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_query(Query::default()).await; router.assert_log_contains("span_id").await; router .assert_log_contains(r#"on_supergraph_response_event=on_supergraph_event"#) .await; - router.execute_query(&query).await; - router.execute_query(&query).await; + router.execute_query(Query::default()).await; + router.execute_query(Query::default()).await; router.assert_log_contains("response_status=200").await; router.graceful_shutdown().await; Ok(()) @@ -231,14 +224,12 @@ async fn test_text_sampler_off() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - router.execute_query(&query).await; - router.execute_query(&query).await; + router.execute_default_query().await; + router.execute_default_query().await; router.assert_log_contains("trace_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("span_id").await; - router.execute_query(&query).await; + router.execute_default_query().await; router.assert_log_contains("response_status=200").await; router.graceful_shutdown().await; Ok(()) diff --git a/apollo-router/tests/integration/telemetry/metrics.rs b/apollo-router/tests/integration/telemetry/metrics.rs index cbce509c13..37b1f1cae9 100644 --- a/apollo-router/tests/integration/telemetry/metrics.rs +++ b/apollo-router/tests/integration/telemetry/metrics.rs @@ -2,7 +2,7 @@ use std::time::Duration; use serde_json::json; -use crate::integration::common::graph_os_enabled; +use 
crate::integration::common::{graph_os_enabled, Query}; use crate::integration::IntegrationTest; const PROMETHEUS_CONFIG: &str = include_str!("fixtures/prometheus.router.yaml"); @@ -107,7 +107,7 @@ async fn test_subgraph_auth_metrics() { router.assert_reloaded().await; // This one will not be signed, counters shouldn't increment. router - .execute_query(&json! {{ "query": "query { me { name } }"}}) + .execute_query(Query::default()) .await; // Get Prometheus metrics. @@ -137,7 +137,9 @@ async fn test_metrics_bad_query() { router.start().await; router.assert_started().await; // This query won't make it to the supergraph service - router.execute_bad_query().await; + router + .execute_query(Query::default().with_bad_query()) + .await; router.assert_metrics_contains(r#"apollo_router_operations_total{http_response_status_code="400",otel_scope_name="apollo/router"} 1"#, None).await; } @@ -157,7 +159,9 @@ async fn test_bad_queries() { None, ) .await; - router.execute_bad_content_type().await; + router + .execute_query(Query::default().with_bad_content_type()) + .await; router .assert_metrics_contains( @@ -166,7 +170,9 @@ async fn test_bad_queries() { ) .await; - router.execute_bad_query().await; + router + .execute_query(Query::default().with_bad_query()) + .await; router .assert_metrics_contains( r#"apollo_router_http_requests_total{error="Must provide query string",status="400",otel_scope_name="apollo/router"}"#, @@ -174,7 +180,9 @@ async fn test_bad_queries() { ) .await; - router.execute_huge_query().await; + router + .execute_query(Query::default().with_huge_query()) + .await; router .assert_metrics_contains( r#"apollo_router_http_requests_total{error="Request body payload too large",status="413",otel_scope_name="apollo/router"} 1"#, @@ -260,13 +268,15 @@ async fn test_gauges_on_reload() { // Introspection query router - .execute_query(&json!({"query":"{__schema {types {name}}}","variables":{}})) + .execute_query(Query::introspection() + + ) .await; // Persisted query 
router .execute_query( - &json!({"query": "{__typename}", "variables":{}, "extensions": {"persistedQuery":{"version" : 1, "sha256Hash" : "ecf4edb46db40b5132295c0291d62fb65d6759a9eedfa4d5d612dd5ec54a6b38"}}}) + Query::builder().body(json!({"query": "{__typename}", "variables":{}, "extensions": {"persistedQuery":{"version" : 1, "sha256Hash" : "ecf4edb46db40b5132295c0291d62fb65d6759a9eedfa4d5d612dd5ec54a6b38"}}})).build() ) .await; diff --git a/apollo-router/tests/integration/telemetry/mod.rs b/apollo-router/tests/integration/telemetry/mod.rs index 8df0a1d753..f79382f772 100644 --- a/apollo-router/tests/integration/telemetry/mod.rs +++ b/apollo-router/tests/integration/telemetry/mod.rs @@ -1,3 +1,5 @@ +use std::collections::{HashMap, HashSet}; + #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] mod datadog; #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] @@ -8,3 +10,19 @@ mod otlp; mod propagation; #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] mod zipkin; +mod verifier; + +#[derive(buildstructor::Builder)] +struct TraceSpec { + operation_name: Option, + version: Option, + services: Vec<&'static str>, + span_names: HashSet<&'static str>, + measured_spans: HashSet<&'static str>, + unmeasured_spans: HashSet<&'static str>, + priority_sampled: Option<&'static str>, + subgraph_sampled: Option, + span_attributes: HashMap<&'static str, &'static str> +} + + diff --git a/apollo-router/tests/integration/telemetry/otlp.rs b/apollo-router/tests/integration/telemetry/otlp.rs index 59dc15a518..6947879116 100644 --- a/apollo-router/tests/integration/telemetry/otlp.rs +++ b/apollo-router/tests/integration/telemetry/otlp.rs @@ -1,18 +1,13 @@ extern crate core; -use std::collections::HashMap; use std::collections::HashSet; -use std::sync::Arc; -use std::sync::Mutex; -use std::time::Duration; +use std::ops::Deref; use anyhow::anyhow; -use opentelemetry_api::trace::SpanContext; use 
opentelemetry_api::trace::TraceId; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceResponse; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceResponse; use prost::Message; -use serde_json::json; use serde_json::Value; use tower::BoxError; use wiremock::matchers::method; @@ -21,9 +16,11 @@ use wiremock::Mock; use wiremock::MockServer; use wiremock::ResponseTemplate; -use crate::integration::common::graph_os_enabled; use crate::integration::common::Telemetry; +use crate::integration::common::{graph_os_enabled, Query}; use crate::integration::IntegrationTest; +use crate::integration::telemetry::TraceSpec; +use crate::integration::telemetry::verifier::Verifier; use crate::integration::ValueExt; #[tokio::test(flavor = "multi_thread")] @@ -45,15 +42,8 @@ async fn test_basic() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); for _ in 0..2 { - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - Spec::builder() + TraceSpec::builder() .operation_name("ExampleQuery") .services(["client", "router", "subgraph"].into()) .span_names( @@ -72,12 +62,16 @@ async fn test_basic() -> Result<(), BoxError> { ) .subgraph_sampled(true) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::default(), + ) .await?; - Spec::builder() + TraceSpec::builder() .service("router") .build() - .validate_metrics(&mock_server) + .validate_otlp_metrics(&mock_server) .await?; router.touch_config().await; router.assert_reloaded().await; @@ -98,6 +92,7 @@ async fn test_otlp_request_with_datadog_propagator() -> Result<(), BoxError> { .telemetry(Telemetry::Otlp { endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) + .extra_propagator(Telemetry::Datadog) .config(&config) .build() 
.await; @@ -105,14 +100,16 @@ async fn test_otlp_request_with_datadog_propagator() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, _) = router.execute_query(&query).await; - Spec::builder() + TraceSpec::builder() .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::default(), + ) .await?; router.graceful_shutdown().await; Ok(()) @@ -137,13 +134,15 @@ async fn test_otlp_request_with_datadog_propagator_no_agent() -> Result<(), BoxE router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, _) = router.execute_query(&query).await; - Spec::builder() + TraceSpec::builder() .services(["client", "router", "subgraph"].into()) .subgraph_sampled(true) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).build(), + ) .await?; router.graceful_shutdown().await; Ok(()) @@ -158,12 +157,11 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_request_with_zipkin_propagator.router.yaml") .replace("", &mock_server.uri()); - let context = Arc::new(Mutex::new(None)); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) - .subgraph_context(context.clone()) + .extra_propagator(Telemetry::Datadog) .config(&config) .build() .await; @@ -171,108 +169,99 @@ async fn test_otlp_request_with_zipkin_trace_context_propagator_with_datadog( router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery 
{topProducts{name}}","variables":{}}); - let (id, _) = router.execute_query(&query).await; - - Spec::builder() - .subgraph_context(context.clone()) + TraceSpec::builder() .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).build(), + ) .await?; // ---------------------- zipkin propagator with unsampled trace // Testing for an unsampled trace, so it should be sent to the otlp exporter with sampling priority set 0 // But it shouldn't send the trace to subgraph as the trace is originally not sampled, the main goal is to measure it at the DD agent level - let id = TraceId::from_hex("80f198ee56343ba864fe8b2a57d3eff7").unwrap(); - let headers: HashMap = [ - ( - "X-B3-TraceId".to_string(), - "80f198ee56343ba864fe8b2a57d3eff7".to_string(), - ), - ( - "X-B3-ParentSpanId".to_string(), - "05e3ac9a4f6e3b90".to_string(), - ), - ("X-B3-SpanId".to_string(), "e457b5a2e4d86bd1".to_string()), - ("X-B3-Sampled".to_string(), "0".to_string()), - ] - .into(); - - let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; - Spec::builder() - .subgraph_context(context.clone()) + TraceSpec::builder() .services(["router"].into()) .priority_sampled("0") - .subgraph_sampled(true) + .subgraph_sampled(false) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder() + .traced(false) + .header("X-B3-TraceId", "80f198ee56343ba864fe8b2a57d3eff7") + .header("X-B3-ParentSpanId", "05e3ac9a4f6e3b90") + .header("X-B3-SpanId", "e457b5a2e4d86bd1") + .header("X-B3-Sampled", "0") + .build(), + ) .await?; // ---------------------- trace context propagation // Testing for a trace containing the right tracestate with m and psr for DD and a sampled trace, so it should be sent to the otlp exporter with sampling priority set to 1 // And it should also send the 
trace to subgraph as the trace is sampled - let id = TraceId::from_hex("0af7651916cd43dd8448eb211c80319c").unwrap(); - let headers: HashMap = [ - ( - "traceparent".to_string(), - "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01".to_string(), - ), - ("tracestate".to_string(), "m=1,psr=1".to_string()), - ] - .into(); - - let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; - Spec::builder() - .subgraph_context(context.clone()) - .services(["router", "subgraph"].into()) + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder() + .traced(true) + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01", + ) + .header("tracestate", "m=1,psr=1") + .build(), + ) .await?; // ---------------------- // Testing for a trace containing the right tracestate with m and psr for DD and an unsampled trace, so it should be sent to the otlp exporter with sampling priority set to 0 // But it shouldn't send the trace to subgraph as the trace is originally not sampled, the main goal is to measure it at the DD agent level - let id = TraceId::from_hex("0af7651916cd43dd8448eb211c80319d").unwrap(); - let headers: HashMap = [ - ( - "traceparent".to_string(), - "00-0af7651916cd43dd8448eb211c80319d-b7ad6b7169203331-00".to_string(), - ), - ("tracestate".to_string(), "m=1,psr=0".to_string()), - ] - .into(); - - let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; - Spec::builder() - .subgraph_context(context.clone()) + TraceSpec::builder() .services(["router"].into()) .priority_sampled("0") + .subgraph_sampled(false) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder() + .traced(false) + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-02", 
+ ) + .header("tracestate", "m=1,psr=0") + .build(), + ) .await?; // ---------------------- // Testing for a trace containing a tracestate m and psr with psr set to 1 for DD and an unsampled trace, so it should be sent to the otlp exporter with sampling priority set to 1 // It should not send the trace to the subgraph as we didn't use the datadog propagator and therefore the trace will remain unsampled. - let id = TraceId::from_hex("0af7651916cd43dd8448eb211c80319e").unwrap(); - let headers: HashMap = [ - ( - "traceparent".to_string(), - "00-0af7651916cd43dd8448eb211c80319e-b7ad6b7169203331-00".to_string(), - ), - ("tracestate".to_string(), "m=1,psr=1".to_string()), - ] - .into(); - - let (_id, _) = router.execute_untraced_query(&query, Some(headers)).await; - Spec::builder() - .subgraph_context(context.clone()) - .services(["router"].into()) + TraceSpec::builder() + .services(["router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder() + .traced(false) + .header( + "traceparent", + "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-03", + ) + .header("tracestate", "m=1,psr=1") + .build(), + ) .await?; // Be careful if you add the same kind of test crafting your own trace id, make sure to increment the previous trace id by 1 if not you'll receive all the previous spans tested with the same trace id before @@ -288,25 +277,26 @@ async fn test_untraced_request_no_sample_datadog_agent() -> Result<(), BoxError> let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_agent_no_sample.router.yaml") .replace("", &mock_server.uri()); - let context = Arc::new(Mutex::new(None)); - let mut router = IntegrationTest::builder() - .subgraph_context(context.clone()) - .config(&config) - .build() - .await; + let mut router = IntegrationTest::builder().config(&config) + .telemetry(Telemetry::Otlp { + endpoint: 
Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .build().await; router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, _) = router.execute_untraced_query(&query, None).await; - Spec::builder() - .subgraph_context(context) + TraceSpec::builder() .services(["router"].into()) .priority_sampled("0") .subgraph_sampled(false) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(false).build(), + ) .await?; router.graceful_shutdown().await; Ok(()) @@ -320,25 +310,27 @@ async fn test_untraced_request_sample_datadog_agent() -> Result<(), BoxError> { let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_agent_sample.router.yaml") .replace("", &mock_server.uri()); - let context = Arc::new(Mutex::new(None)); let mut router = IntegrationTest::builder() - .subgraph_context(context.clone()) .config(&config) - .build() - .await; + .telemetry(Telemetry::Otlp { + endpoint: Some(format!("{}/v1/traces", mock_server.uri())), + }) + .extra_propagator(Telemetry::Datadog) + .build().await; router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, _) = router.execute_untraced_query(&query, None).await; - Spec::builder() - .subgraph_context(context.clone()) - .services(["router"].into()) + TraceSpec::builder() + .services(["router", "subgraph"].into()) .priority_sampled("1") .subgraph_sampled(true) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(false).build(), + ) .await?; router.graceful_shutdown().await; Ok(()) @@ -350,14 +342,13 @@ async fn test_untraced_request_sample_datadog_agent_unsampled() -> Result<(), Bo panic!("Error: test skipped because GraphOS is not 
enabled"); } let mock_server = mock_otlp_server().await; - let context = Arc::new(Mutex::new(None)); let config = include_str!("fixtures/otlp_datadog_agent_sample_no_sample.router.yaml") .replace("", &mock_server.uri()); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) - .subgraph_context(context.clone()) + .extra_propagator(Telemetry::Datadog) .config(&config) .build() .await; @@ -365,15 +356,16 @@ async fn test_untraced_request_sample_datadog_agent_unsampled() -> Result<(), Bo router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let (id, _) = router.execute_untraced_query(&query, None).await; - Spec::builder() + TraceSpec::builder() .services(["router"].into()) .priority_sampled("0") .subgraph_sampled(false) - .subgraph_context(context.clone()) .build() - .validate_trace(id, &mock_server) + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(false).build(), + ) .await?; router.graceful_shutdown().await; Ok(()) @@ -387,7 +379,6 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_propagation.router.yaml") .replace("", &mock_server.uri()); - let context = Arc::new(Mutex::new(None)); let mut router = IntegrationTest::builder() // We're using datadog propagation as this is what we are trying to test. 
.telemetry(Telemetry::Otlp { @@ -395,10 +386,6 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { }) .extra_propagator(Telemetry::Datadog) .config(config) - .responder(ResponseTemplate::new(200).set_body_json( - json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), - )) - .subgraph_context(context.clone()) .build() .await; @@ -406,67 +393,63 @@ async fn test_priority_sampling_propagated() -> Result<(), BoxError> { router.assert_started().await; // Parent based sampling. psr MUST be populated with the value that we pass in. - test_psr( - &mut router, - Some("-1"), - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router"].into()) - .priority_sampled("-1") - .subgraph_sampled(false) - .build(), - &mock_server, - ) - .await?; - test_psr( - &mut router, - Some("0"), - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router"].into()) - .priority_sampled("0") - .subgraph_sampled(false) - .build(), - &mock_server, - ) - .await?; - test_psr( - &mut router, - Some("1"), - Spec::builder() - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .subgraph_sampled(true) - .build(), - &mock_server, - ) - .await?; - test_psr( - &mut router, - Some("2"), - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router", "subgraph"].into()) - .priority_sampled("2") - .subgraph_sampled(true) - .build(), - &mock_server, - ) - .await?; + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("-1") + .subgraph_sampled(false) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("-1").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("0").build(), + ) + .await?; + 
TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("1").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("2") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("2").build(), + ) + .await?; // No psr was passed in the router is free to set it. This will be 1 as we are going to sample here. - test_psr( - &mut router, - None, - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .subgraph_sampled(true) - .build(), - &mock_server, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).build(), + ) + .await?; router.graceful_shutdown().await; @@ -481,17 +464,12 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_propagation_no_parent_sampler.router.yaml") .replace("", &mock_server.uri()); - let context = Arc::new(Mutex::new(None)); let mut router = IntegrationTest::builder() .telemetry(Telemetry::Otlp { endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) .extra_propagator(Telemetry::Datadog) .config(config) - .responder(ResponseTemplate::new(200).set_body_json( - json!({"data":{"topProducts":[{"name":"Table"},{"name":"Couch"},{"name":"Chair"}]}}), - )) - .subgraph_context(context.clone()) .build() .await; @@ -499,168 +477,138 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { router.assert_started().await; // The router will 
ignore the upstream PSR as parent based sampling is disabled. - test_psr( - &mut router, - Some("-1"), - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .subgraph_sampled(true) - .build(), - &mock_server, - ) - .await?; - test_psr( - &mut router, - Some("0"), - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .subgraph_sampled(true) - .build(), - &mock_server, - ) - .await?; - test_psr( - &mut router, - Some("1"), - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .subgraph_sampled(true) - .build(), - &mock_server, - ) - .await?; - test_psr( - &mut router, - Some("2"), - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .subgraph_sampled(true) - .build(), - &mock_server, - ) - .await?; - - test_psr( - &mut router, - None, - Spec::builder() - .subgraph_context(context.clone()) - .services(["client", "router", "subgraph"].into()) - .priority_sampled("1") - .subgraph_sampled(true) - .build(), - &mock_server, - ) - .await?; - - router.graceful_shutdown().await; - Ok(()) -} + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("-1").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("0").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( 
+ &mut router, + &mock_server, + Query::builder().traced(true).psr("1").build(), + ) + .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).psr("2").build(), + ) + .await?; -async fn test_psr( - router: &mut IntegrationTest, - psr: Option<&str>, - trace_spec: Spec, - mock_server: &MockServer, -) -> Result<(), BoxError> { - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); - let headers = if let Some(psr) = psr { - vec![("x-datadog-sampling-priority".to_string(), psr.to_string())] - } else { - vec![] - }; - let (id, result) = router - .execute_query_with_headers(&query, headers.into_iter().collect()) - .await; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .priority_sampled("1") + .subgraph_sampled(true) + .build() + .validate_otlp_trace( + &mut router, + &mock_server, + Query::builder().traced(true).build(), + ) + .await?; - assert!(result.status().is_success()); + router.graceful_shutdown().await; - trace_spec.validate_trace(id, mock_server).await?; Ok(()) } -#[derive(buildstructor::Builder)] -struct Spec { - subgraph_context: Option>>>, - operation_name: Option, - version: Option, - services: HashSet<&'static str>, - span_names: HashSet<&'static str>, - measured_spans: HashSet<&'static str>, - unmeasured_spans: HashSet<&'static str>, - priority_sampled: Option<&'static str>, - subgraph_sampled: Option, +struct OtlpTraceSpec<'a> { + trace_spec: TraceSpec, + mock_server: &'a MockServer } +impl Deref for OtlpTraceSpec<'_> { + type Target = TraceSpec; -impl Spec { - #[allow(clippy::too_many_arguments)] - async fn validate_trace(&self, id: TraceId, mock_server: &MockServer) -> Result<(), BoxError> { - for _ in 0..10 { - if self.find_valid_trace(id, mock_server).await.is_ok() { - return Ok(()); - } - 
tokio::time::sleep(Duration::from_millis(100)).await; - } - self.find_valid_trace(id, mock_server).await?; - - if let Some(subgraph_context) = &self.subgraph_context { - let subgraph_context = subgraph_context.lock().expect("poisoned"); - let subgraph_span_context = subgraph_context.as_ref().expect("state").clone(); + fn deref(&self) -> &Self::Target { + &self.trace_spec + } +} - assert_eq!( - subgraph_span_context.trace_state().get("psr"), - self.priority_sampled - ); - if let Some(sampled) = self.subgraph_sampled { - assert_eq!(subgraph_span_context.is_sampled(), sampled); - } - } +impl Verifier for OtlpTraceSpec<'_> { + fn verify_span_attributes(&self, _span: &Value) -> Result<(), BoxError> { + // TODO Ok(()) } + fn spec(&self) -> &TraceSpec { + &self.trace_spec + } - async fn validate_metrics(&self, mock_server: &MockServer) -> Result<(), BoxError> { - for _ in 0..10 { - if self.find_valid_metrics(mock_server).await.is_ok() { - return Ok(()); - } - tokio::time::sleep(Duration::from_millis(100)).await; + fn measured_span(&self, trace: &Value, name: &str) -> Result { + let binding1 = trace.select_path(&format!( + "$..[?(@.meta.['otel.original_name'] == '{}')].metrics.['_dd.measured']", + name + ))?; + let binding2 = trace.select_path(&format!( + "$..[?(@.name == '{}')].metrics.['_dd.measured']", + name + ))?; + Ok(binding1 + .first() + .or(binding2.first()) + .and_then(|v| v.as_f64()) + .map(|v| v == 1.0) + .unwrap_or_default()) + } + + async fn find_valid_metrics(&self) -> Result<(), BoxError> { + let requests = self.mock_server + .received_requests() + .await + .expect("Could not get otlp requests"); + if let Some(metrics) = requests.iter().find(|r| r.url.path().ends_with("/metrics")) { + let metrics = opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest::decode(bytes::Bytes::copy_from_slice(&metrics.body))?; + let json_metrics = serde_json::to_value(metrics)?; + // For now just validate service name. 
+ self.verify_services(&json_metrics)?; + + Ok(()) + } else { + Err(anyhow!("No metrics received").into()) } - self.find_valid_metrics(mock_server).await?; - Ok(()) } - #[allow(clippy::too_many_arguments)] - async fn find_valid_trace( - &self, - trace_id: TraceId, - mock_server: &MockServer, - ) -> Result<(), BoxError> { - // A valid trace has: - // * All three services - // * The correct spans - // * All spans are parented - // * Required attributes of 'router' span has been set - - let requests = mock_server.received_requests().await; - let trace= Value::Array(requests.unwrap_or_default().iter().filter(|r| r.url.path().ends_with("/traces")) - .filter_map(|r|{ + + async fn get_trace(&self, trace_id: TraceId) -> Result { + let requests = self.mock_server.received_requests().await; + let trace = Value::Array(requests.unwrap_or_default().iter().filter(|r| r.url.path().ends_with("/traces")) + .filter_map(|r| { match opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest::decode( bytes::Bytes::copy_from_slice(&r.body), ) { Ok(trace) => { match serde_json::to_value(trace) { Ok(trace) => { - Some(trace) } + Some(trace) + } Err(_) => { None } @@ -671,22 +619,12 @@ impl Spec { } } }).filter(|t| { - let datadog_trace_id = TraceId::from_u128(trace_id.to_datadog() as u128); let trace_found1 = !t.select_path(&format!("$..[?(@.traceId == '{}')]", trace_id)).unwrap_or_default().is_empty(); let trace_found2 = !t.select_path(&format!("$..[?(@.traceId == '{}')]", datadog_trace_id)).unwrap_or_default().is_empty(); trace_found1 | trace_found2 }).collect()); - - self.verify_services(&trace)?; - self.verify_spans_present(&trace)?; - self.verify_measured_spans(&trace)?; - self.verify_operation_name(&trace)?; - self.verify_priority_sampled(&trace)?; - self.verify_version(&trace)?; - self.verify_span_kinds(&trace)?; - - Ok(()) + Ok(trace) } fn verify_version(&self, trace: &Value) -> Result<(), BoxError> { @@ -704,50 +642,9 @@ impl Spec { Ok(()) } - fn 
verify_measured_spans(&self, trace: &Value) -> Result<(), BoxError> { - for expected in &self.measured_spans { - assert!( - self.measured_span(trace, expected)?, - "missing measured span {}", - expected - ); - } - for unexpected in &self.unmeasured_spans { - assert!( - !self.measured_span(trace, unexpected)?, - "unexpected measured span {}", - unexpected - ); - } - Ok(()) - } - fn measured_span(&self, trace: &Value, name: &str) -> Result { - let binding1 = trace.select_path(&format!( - "$..[?(@.meta.['otel.original_name'] == '{}')].metrics.['_dd.measured']", - name - ))?; - let binding2 = trace.select_path(&format!( - "$..[?(@.name == '{}')].metrics.['_dd.measured']", - name - ))?; - Ok(binding1 - .first() - .or(binding2.first()) - .and_then(|v| v.as_f64()) - .map(|v| v == 1.0) - .unwrap_or_default()) - } - - fn verify_span_kinds(&self, trace: &Value) -> Result<(), BoxError> { - // Validate that the span.kind has been propagated. We can just do this for a selection of spans. - self.validate_span_kind(trace, "router", "server")?; - self.validate_span_kind(trace, "supergraph", "internal")?; - self.validate_span_kind(trace, "http_request", "client")?; - Ok(()) - } - fn verify_services(&self, trace: &Value) -> Result<(), BoxError> { + fn verify_services(&self, trace: &Value) -> Result<(), axum::BoxError> { let actual_services: HashSet = trace .select_path("$..resource.attributes..[?(@.key == 'service.name')].value.stringValue")? 
.into_iter() @@ -774,7 +671,7 @@ impl Spec { .filter_map(|span_name| span_name.as_string()) .collect(); let mut span_names: HashSet<&str> = self.span_names.clone(); - if self.services.contains("client") { + if self.services.contains(&"client") { span_names.insert("client_request"); } tracing::debug!("found spans {:?}", operation_names); @@ -816,6 +713,7 @@ impl Spec { Ok(()) } + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { if let Some(expected_operation_name) = &self.operation_name { let binding = @@ -855,22 +753,6 @@ impl Spec { Ok(()) } - async fn find_valid_metrics(&self, mock_server: &MockServer) -> Result<(), BoxError> { - let requests = mock_server - .received_requests() - .await - .expect("Could not get otlp requests"); - if let Some(metrics) = requests.iter().find(|r| r.url.path().ends_with("/metrics")) { - let metrics = opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest::decode(bytes::Bytes::copy_from_slice(&metrics.body))?; - let json_metrics = serde_json::to_value(metrics)?; - // For now just validate service name. 
- self.verify_services(&json_metrics)?; - - Ok(()) - } else { - Err(anyhow!("No metrics received").into()) - } - } } async fn mock_otlp_server() -> MockServer { @@ -905,3 +787,19 @@ impl DatadogId for TraceId { u64::from_be_bytes(bytes.try_into().unwrap()) } } + + +impl TraceSpec { + async fn validate_otlp_trace(self, router: &mut IntegrationTest, mock_server: &MockServer, query: Query) -> Result<(), BoxError>{ + OtlpTraceSpec { + trace_spec: self, + mock_server + }.validate_trace(router, query).await + } + async fn validate_otlp_metrics(self, mock_server: &MockServer) -> Result<(), BoxError>{ + OtlpTraceSpec { + trace_spec: self, + mock_server + }.validate_metrics().await + } +} \ No newline at end of file diff --git a/apollo-router/tests/integration/telemetry/propagation.rs b/apollo-router/tests/integration/telemetry/propagation.rs index e458f1986c..d1b8258c6f 100644 --- a/apollo-router/tests/integration/telemetry/propagation.rs +++ b/apollo-router/tests/integration/telemetry/propagation.rs @@ -1,9 +1,9 @@ use serde_json::json; use tower::BoxError; -use crate::integration::common::graph_os_enabled; use crate::integration::common::IntegrationTest; use crate::integration::common::Telemetry; +use crate::integration::common::{graph_os_enabled, Query}; #[tokio::test(flavor = "multi_thread")] async fn test_trace_id_via_header() -> Result<(), BoxError> { @@ -12,8 +12,7 @@ async fn test_trace_id_via_header() -> Result<(), BoxError> { return Ok(()); } async fn make_call(router: &mut IntegrationTest, trace_id: &str) { - let _ = router.execute_query_with_headers(&json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name}}","variables":{}}), - [("id_from_header".to_string(), trace_id.to_string())].into()).await; + let _ = router.execute_query(Query::builder().body(json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name}}","variables":{}})).header("id_from_header".to_string(), 
trace_id.to_string()).build()).await; } let mut router = IntegrationTest::builder() diff --git a/apollo-router/tests/integration/telemetry/verifier.rs b/apollo-router/tests/integration/telemetry/verifier.rs new file mode 100644 index 0000000000..bf4ffe4d70 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/verifier.rs @@ -0,0 +1,160 @@ +use crate::integration::common::Query; +use crate::integration::telemetry::TraceSpec; +use crate::integration::IntegrationTest; +use opentelemetry_api::trace::{SpanContext, TraceId}; +use serde_json::Value; +use std::time::Duration; +use anyhow::anyhow; +use tower::BoxError; + +pub trait Verifier { + fn spec(&self) -> &TraceSpec; + async fn validate_trace(&self, router: &mut IntegrationTest, query: Query) -> Result<(), BoxError> { + let (id, response) = router.execute_query(query).await; + for _ in 0..20 { + if self.find_valid_trace(id).await.is_ok() { + break; + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + self.find_valid_trace(id).await?; + let subgraph_context = router.subgraph_context(); + assert!(response.status().is_success()); + self.validate_subgraph(subgraph_context)?; + Ok(()) + + } + + async fn validate_metrics(&self) -> Result<(), BoxError> { + for _ in 0..10 { + if self.find_valid_metrics().await.is_ok() { + break; + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + self.find_valid_metrics().await?; + Ok(()) + } + + async fn find_valid_metrics(&self) -> Result<(), BoxError> { + unimplemented!("find_valid_metrics") + } + + fn validate_subgraph( + &self, + subgraph_context: SpanContext, + ) -> Result<(), BoxError> { + self.validate_subgraph_priority_sampled(&subgraph_context)?; + self.validate_subgraph_sampled(&subgraph_context)?; + Ok(()) + } + fn validate_subgraph_sampled( + &self, + subgraph_context: &SpanContext, + ) -> Result<(), BoxError> { + if let Some(sampled) = self.spec().priority_sampled { + assert_eq!( + subgraph_context.trace_state().get("psr"), + Some(sampled), + 
"subgraph psr" + ); + } + + + Ok(()) + } + + fn validate_subgraph_priority_sampled( + &self, + subgraph_context: &SpanContext, + ) -> Result<(), BoxError>{ + if let Some(sampled) = self.spec().subgraph_sampled { + assert_eq!(subgraph_context.is_sampled(), sampled, "subgraph sampled"); + } + Ok(()) + } + + + + #[allow(clippy::too_many_arguments)] + async fn find_valid_trace(&self, trace_id: TraceId) -> Result<(), BoxError> { + // A valid trace has: + // * All three services + // * The correct spans + // * All spans are parented + // * Required attributes of 'router' span has been set + + // For now just validate service name. + let trace: Value = self.get_trace(trace_id).await?; + println!("trace: {}", trace_id); + self.verify_services(&trace)?; + println!("services verified"); + self.verify_spans_present(&trace)?; + println!("spans present verified"); + self.verify_measured_spans(&trace)?; + println!("measured spans verified"); + self.verify_operation_name(&trace)?; + println!("operation name verified"); + self.verify_priority_sampled(&trace)?; + println!("priority sampled verified"); + self.verify_version(&trace)?; + println!("version verified"); + self.verify_span_kinds(&trace)?; + println!("span kinds verified"); + self.verify_span_attributes(&trace)?; + println!("span attributes verified"); + Ok(()) + } + + async fn get_trace(&self, trace_id: TraceId) -> Result; + + fn verify_version(&self, trace: &Value) -> Result<(), BoxError>; + + fn verify_measured_spans(&self, trace: &Value) -> Result<(), BoxError> { + for expected in &self.spec().measured_spans { + let measured = self.measured_span(trace, expected)?; + if !measured { + return Err(anyhow!("missing measured span {}", expected).into()); + } + } + for unexpected in &self.spec().unmeasured_spans { + let measured = self.measured_span(trace, unexpected)?; + if measured { + return Err(anyhow!("unexpected measured span {}", measured).into()); + } + } + Ok(()) + } + + fn measured_span(&self, trace: &Value, name: 
&str) -> Result; + + fn verify_span_kinds(&self, trace: &Value) -> Result<(), BoxError> { + // Validate that the span.kind has been propagated. We can just do this for a selection of spans. + if self.spec().span_names.contains("router") { + self.validate_span_kind(trace, "router", "server")?; + } + + if self.spec().span_names.contains("supergraph") { + self.validate_span_kind(trace, "supergraph", "internal")?; + } + + if self.spec().span_names.contains("http_request") { + self.validate_span_kind(trace, "http_request", "client")?; + } + + Ok(()) + } + + fn verify_services(&self, trace: &Value) -> Result<(), BoxError>; + + fn verify_spans_present(&self, trace: &Value) -> Result<(), BoxError>; + + fn validate_span_kind(&self, trace: &Value, name: &str, kind: &str) -> Result<(), BoxError>; + + fn verify_span_attributes(&self, trace: &Value) -> Result<(), BoxError>; + + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError>; + + fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError>; + +} diff --git a/apollo-router/tests/integration/telemetry/zipkin.rs b/apollo-router/tests/integration/telemetry/zipkin.rs index c0d5e0a8d5..460c10add4 100644 --- a/apollo-router/tests/integration/telemetry/zipkin.rs +++ b/apollo-router/tests/integration/telemetry/zipkin.rs @@ -1,18 +1,18 @@ extern crate core; use std::collections::HashSet; -use std::time::Duration; +use std::ops::Deref; +use crate::integration::common::{Query, Telemetry}; +use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::TraceSpec; +use crate::integration::IntegrationTest; +use crate::integration::ValueExt; use anyhow::anyhow; use opentelemetry_api::trace::TraceId; -use serde_json::json; use serde_json::Value; use tower::BoxError; -use crate::integration::common::Telemetry; -use crate::integration::IntegrationTest; -use crate::integration::ValueExt; - #[tokio::test(flavor = "multi_thread")] async fn test_basic() -> Result<(), BoxError> { let mut 
router = IntegrationTest::builder() @@ -24,22 +24,13 @@ async fn test_basic() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let query = json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}}); for _ in 0..2 { - let (id, result) = router.execute_query(&query).await; - assert!(!result - .headers() - .get("apollo-custom-trace-id") - .unwrap() - .is_empty()); - validate_trace( - id, - &query, - Some("ExampleQuery"), - &["client", "router", "subgraph"], - false, - ) - .await?; + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .operation_name("ExampleQuery") + .build() + .validate_zipkin_trace(&mut router, Query::default()) + .await?; router.touch_config().await; router.assert_reloaded().await; } @@ -47,85 +38,115 @@ async fn test_basic() -> Result<(), BoxError> { Ok(()) } -async fn validate_trace( - id: TraceId, - query: &Value, - operation_name: Option<&str>, - services: &[&'static str], - custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - let params = url::form_urlencoded::Serializer::new(String::new()) - .append_pair("service", services.first().expect("expected root service")) - .finish(); - - let url = format!("http://localhost:9411/api/v2/trace/{id}?{params}"); - for _ in 0..10 { - if find_valid_trace( - &url, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await - .is_ok() - { - return Ok(()); - } - tokio::time::sleep(Duration::from_millis(100)).await; + +struct ZipkinTraceSpec { + trace_spec: TraceSpec, +} +impl Deref for ZipkinTraceSpec { + type Target = TraceSpec; + + fn deref(&self) -> &Self::Target { + &self.trace_spec } - find_valid_trace( - &url, - query, - operation_name, - services, - custom_span_instrumentation, - ) - .await?; - Ok(()) } -async fn find_valid_trace( - url: &str, - _query: &Value, - _operation_name: Option<&str>, - services: &[&'static str], - _custom_span_instrumentation: bool, -) -> Result<(), BoxError> { - // A 
valid trace has: - // * All three services - // * The correct spans - // * All spans are parented - // * Required attributes of 'router' span has been set - - // For now just validate service name. - let trace: Value = reqwest::get(url) - .await - .map_err(|e| anyhow!("failed to contact zipkin; {}", e))? - .json() - .await?; - tracing::debug!("{}", serde_json::to_string_pretty(&trace)?); - verify_trace_participants(&trace, services)?; +impl Verifier for ZipkinTraceSpec { + fn verify_span_attributes(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } + fn verify_version(&self, _trace: &Value) -> Result<(), BoxError> { - Ok(()) + Ok(()) + } + + + fn measured_span(&self, _trace: &Value, _name: &str) -> Result { + Ok(true) + } + + fn verify_span_kinds(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } + + fn verify_services(&self, trace: &Value) -> Result<(), axum::BoxError> { + let actual_services: HashSet = trace + .select_path("$..serviceName")? + .into_iter() + .filter_map(|service| service.as_string()) + .collect(); + tracing::debug!("found services {:?}", actual_services); + + let expected_services = self.trace_spec.services + .iter() + .map(|s| s.to_string()) + .collect::>(); + if actual_services != expected_services { + return Err(BoxError::from(format!( + "incomplete traces, got {actual_services:?} expected {expected_services:?}" + ))); + } + Ok(()) + } + + fn verify_spans_present(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } + + fn validate_span_kind(&self, _trace: &Value, _name: &str, _kind: &str) -> Result<(), BoxError> { + Ok(()) + } + + fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { + if let Some(expected_operation_name) = &self.operation_name { + let binding = + trace.select_path("$..[?(@.name == 'supergraph')].tags..['graphql.operation.name']")?; + let operation_name = binding.first(); + assert_eq!( + operation_name + .expect("graphql.operation.name expected") + .as_str() + 
.expect("graphql.operation.name must be a string"), + expected_operation_name + ); + } + Ok(()) + } + + fn verify_priority_sampled(&self, _trace: &Value) -> Result<(), BoxError> { + Ok(()) + } + + + async fn get_trace(&self, trace_id: TraceId) -> Result { + let params = url::form_urlencoded::Serializer::new(String::new()) + .append_pair("service", self.trace_spec.services.first().expect("expected root service")) + .finish(); + + let id = trace_id.to_string(); + let url = format!("http://localhost:9411/api/v2/trace/{id}?{params}"); + println!("url: {}", url); + let value: serde_json::Value = reqwest::get(url) + .await + .map_err(|e| anyhow!("failed to contact datadog; {}", e))? + .json() + .await + .map_err(|e| anyhow!("failed to contact datadog; {}", e))?; + Ok(value) + } + + fn spec(&self) -> &TraceSpec { + &self.trace_spec + } } -fn verify_trace_participants(trace: &Value, services: &[&'static str]) -> Result<(), BoxError> { - let actual_services: HashSet = trace - .select_path("$..serviceName")? 
- .into_iter() - .filter_map(|service| service.as_string()) - .collect(); - tracing::debug!("found services {:?}", actual_services); - - let expected_services = services - .iter() - .map(|s| s.to_string()) - .collect::>(); - if actual_services != expected_services { - return Err(BoxError::from(format!( - "incomplete traces, got {actual_services:?} expected {expected_services:?}" - ))); +impl TraceSpec { + async fn validate_zipkin_trace( + self, + router: &mut IntegrationTest, + query: Query, + ) -> Result<(), BoxError> { + ZipkinTraceSpec { trace_spec: self } + .validate_trace(router, query) + .await } - Ok(()) } diff --git a/apollo-router/tests/integration/traffic_shaping.rs b/apollo-router/tests/integration/traffic_shaping.rs index feb9a7e725..5d8b45e28b 100644 --- a/apollo-router/tests/integration/traffic_shaping.rs +++ b/apollo-router/tests/integration/traffic_shaping.rs @@ -5,7 +5,7 @@ use serde_json::json; use tower::BoxError; use wiremock::ResponseTemplate; -use crate::integration::common::graph_os_enabled; +use crate::integration::common::{graph_os_enabled, Query}; use crate::integration::common::Telemetry; use crate::integration::IntegrationTest; @@ -99,9 +99,9 @@ async fn test_router_timeout_operation_name_in_tracing() -> Result<(), BoxError> router.assert_started().await; let (_trace_id, response) = router - .execute_query(&json!({ + .execute_query(Query::builder().body(json!({ "query": "query UniqueName { topProducts { name } }" - })) + })).build()) .await; assert_eq!(response.status(), 504); let response = response.text().await?; diff --git a/apollo-router/tests/samples_tests.rs b/apollo-router/tests/samples_tests.rs index 7f06f1d5cc..abc45e2ff1 100644 --- a/apollo-router/tests/samples_tests.rs +++ b/apollo-router/tests/samples_tests.rs @@ -30,6 +30,7 @@ use wiremock::ResponseTemplate; #[path = "./common.rs"] pub(crate) mod common; pub(crate) use common::IntegrationTest; +use crate::common::Query; fn main() -> Result> { let args = 
Arguments::from_args(); @@ -497,7 +498,7 @@ impl TestExecution { writeln!(out, "header: {:?}\n", headers).unwrap(); let (_, response) = router - .execute_query_with_headers(&request, headers.clone()) + .execute_query(Query::builder().body(request).headers(headers.clone()).build()) .await; writeln!(out, "response headers: {:?}", response.headers()).unwrap(); From 530eecd5c802d2ab4d5a2622c5559146cdfc96b9 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 11:04:28 +0000 Subject: [PATCH 16/26] Fix coprocessor test --- apollo-router/tests/integration/coprocessor.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apollo-router/tests/integration/coprocessor.rs b/apollo-router/tests/integration/coprocessor.rs index 21c5f0db2b..dd7adcfbbf 100644 --- a/apollo-router/tests/integration/coprocessor.rs +++ b/apollo-router/tests/integration/coprocessor.rs @@ -43,7 +43,7 @@ async fn test_coprocessor_limit_payload() -> Result<(), BoxError> { // Expect a small query Mock::given(method("POST")) .and(path("/")) - .and(body_partial_json(json!({"version":1,"stage":"RouterRequest","control":"continue","body":"{\"query\":\"query {topProducts{name}}\",\"variables\":{}}","method":"POST"}))) + .and(body_partial_json(json!({"version":1,"stage":"RouterRequest","control":"continue","body":"{\"query\":\"query ExampleQuery {topProducts{name}}\",\"variables\":{}}","method":"POST"}))) .respond_with( ResponseTemplate::new(200).set_body_json(json!({"version":1,"stage":"RouterRequest","control":"continue","body":"{\"query\":\"query {topProducts{name}}\",\"variables\":{}}","method":"POST"})), ) From 0a1e1f7cfece3d51e4647e6b6592b7d86d54d2c9 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 11:07:56 +0000 Subject: [PATCH 17/26] Fix redis test --- apollo-router/tests/common.rs | 5 +++++ apollo-router/tests/integration/redis.rs | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index 
5374059ccf..6e527834e1 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -116,6 +116,11 @@ impl Query { self } + pub fn with_anonymous(mut self) -> Self { + self.body = json!({"query":"query {topProducts{name}}","variables":{}}); + self + } + pub fn with_huge_query(mut self) -> Self { self.body = json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name}}","variables":{}}); self diff --git a/apollo-router/tests/integration/redis.rs b/apollo-router/tests/integration/redis.rs index f5e3da3438..a19d2fd261 100644 --- a/apollo-router/tests/integration/redis.rs +++ b/apollo-router/tests/integration/redis.rs @@ -41,7 +41,7 @@ use serde_json::Value; use tower::BoxError; use tower::ServiceExt; -use crate::integration::common::graph_os_enabled; +use crate::integration::common::{graph_os_enabled, Query}; use crate::integration::IntegrationTest; #[tokio::test(flavor = "multi_thread")] @@ -1109,11 +1109,11 @@ async fn test_redis_query_plan_config_update(updated_config: &str, new_cache_key ); assert_ne!(starting_key, new_cache_key, "starting_key (cache key for the initial config) and new_cache_key (cache key with the updated config) should not be equal. 
This either means that the cache key is not being generated correctly, or that the test is not actually checking the updated key."); - router.execute_default_query().await; + router.execute_query(Query::default().with_anonymous()).await; router.assert_redis_cache_contains(starting_key, None).await; router.update_config(updated_config).await; router.assert_reloaded().await; - router.execute_default_query().await; + router.execute_query(Query::default().with_anonymous()).await; router .assert_redis_cache_contains(new_cache_key, Some(starting_key)) .await; From 14c3ed6e81900dbfd265f28bd92f8aaa146b45fe Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 12:02:01 +0000 Subject: [PATCH 18/26] Lint --- apollo-router/tests/common.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index 4d53fddf19..7dd7636a00 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -106,26 +106,31 @@ impl Query { } } impl Query { + #[allow(dead_code)] pub fn with_bad_content_type(mut self) -> Self { self.content_type = "garbage".to_string(); self } + #[allow(dead_code)] pub fn with_bad_query(mut self) -> Self { self.body = json!({"garbage":{}}); self } + #[allow(dead_code)] pub fn with_anonymous(mut self) -> Self { self.body = json!({"query":"query {topProducts{name}}","variables":{}}); self } + #[allow(dead_code)] pub fn with_huge_query(mut self) -> Self { self.body = json!({"query":"query {topProducts{name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name, name}}","variables":{}}); self } + #[allow(dead_code)] pub fn introspection() -> Query { Query::builder() .body(json!({"query":"{__schema {types {name}}}","variables":{}})) @@ -492,6 +497,7 @@ impl IntegrationTest { Dispatch::new(subscriber) } + #[allow(dead_code)] pub fn subgraph_context(&self) -> SpanContext { 
self.subgraph_context .lock() @@ -627,12 +633,14 @@ impl IntegrationTest { fs::copy(supergraph_path, &self.test_schema_location).expect("could not write schema"); } + #[allow(dead_code)] pub fn execute_default_query( &self, ) -> impl std::future::Future { self.execute_query(Query::builder().build()) } + #[allow(dead_code)] pub fn execute_query( &self, query: Query, From 1c0a232e6c0295d8b141ee7d5bb65991180ea43c Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 13:50:54 +0000 Subject: [PATCH 19/26] Lint --- apollo-router/tests/common.rs | 11 ++- apollo-router/tests/integration/batching.rs | 7 +- .../tests/integration/coprocessor.rs | 7 +- .../tests/integration/introspection.rs | 5 +- .../tests/integration/operation_limits.rs | 9 ++- .../query_planner/max_evaluated_plans.rs | 25 +++++-- apollo-router/tests/integration/redis.rs | 11 ++- .../tests/integration/subgraph_response.rs | 69 ++++++++++++----- .../tests/integration/subscription.rs | 8 +- apollo-router/tests/integration/supergraph.rs | 16 +++- .../tests/integration/telemetry/datadog.rs | 1 - .../tests/integration/telemetry/jaeger.rs | 75 +++++++++++++------ .../tests/integration/telemetry/logging.rs | 3 +- .../tests/integration/telemetry/metrics.rs | 13 +--- .../tests/integration/telemetry/mod.rs | 37 +++++++-- .../tests/integration/telemetry/otlp.rs | 63 ++++++++-------- .../integration/telemetry/propagation.rs | 3 +- .../tests/integration/telemetry/verifier.rs | 36 ++++----- .../tests/integration/telemetry/zipkin.rs | 32 ++++---- .../tests/integration/traffic_shaping.rs | 13 +++- apollo-router/tests/samples_tests.rs | 8 +- 21 files changed, 294 insertions(+), 158 deletions(-) diff --git a/apollo-router/tests/common.rs b/apollo-router/tests/common.rs index 7dd7636a00..e70cf5f0de 100644 --- a/apollo-router/tests/common.rs +++ b/apollo-router/tests/common.rs @@ -8,7 +8,7 @@ use std::sync::Arc; use std::sync::Mutex; use std::time::Duration; -use buildstructor::{buildstructor}; +use 
buildstructor::buildstructor; use fred::clients::RedisClient; use fred::interfaces::ClientLike; use fred::interfaces::KeysInterface; @@ -335,7 +335,11 @@ impl Telemetry { } } - pub(crate) fn extract_context(&self, request: &wiremock::Request, context: &Context) -> Context { + pub(crate) fn extract_context( + &self, + request: &wiremock::Request, + context: &Context, + ) -> Context { let headers: HashMap = request .headers .iter() @@ -362,8 +366,7 @@ impl Telemetry { .expect("psr"); let new_trace_id = if original_span_context.is_valid() { original_span_context.trace_id() - } - else { + } else { context.span().span_context().trace_id() }; context = context.with_remote_span_context(SpanContext::new( diff --git a/apollo-router/tests/integration/batching.rs b/apollo-router/tests/integration/batching.rs index f9e7ba6ab8..521e615b30 100644 --- a/apollo-router/tests/integration/batching.rs +++ b/apollo-router/tests/integration/batching.rs @@ -856,7 +856,8 @@ mod helper { use wiremock::ResponseTemplate; use super::test_is_enabled; - use crate::integration::common::{IntegrationTest, Query}; + use crate::integration::common::IntegrationTest; + use crate::integration::common::Query; /// Helper type for specifying a valid handler pub type Handler = fn(&wiremock::Request) -> ResponseTemplate; @@ -916,7 +917,9 @@ mod helper { // Execute the request let request = serde_json::to_value(requests)?; - let (_span, response) = router.execute_query(Query::builder().body(request).build()).await; + let (_span, response) = router + .execute_query(Query::builder().body(request).build()) + .await; serde_json::from_slice::>(&response.bytes().await?).map_err(BoxError::from) } diff --git a/apollo-router/tests/integration/coprocessor.rs b/apollo-router/tests/integration/coprocessor.rs index dd7adcfbbf..d82d15ca7c 100644 --- a/apollo-router/tests/integration/coprocessor.rs +++ b/apollo-router/tests/integration/coprocessor.rs @@ -7,7 +7,8 @@ use wiremock::matchers::path; use wiremock::Mock; use 
wiremock::ResponseTemplate; -use crate::integration::common::{graph_os_enabled, Query}; +use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::IntegrationTest; #[tokio::test(flavor = "multi_thread")] @@ -75,7 +76,9 @@ async fn test_coprocessor_limit_payload() -> Result<(), BoxError> { assert_eq!(response.status(), 200); // This query is huge and will be rejected because it is too large before hitting the coprocessor - let (_trace_id, response) = router.execute_query(Query::default().with_huge_query()).await; + let (_trace_id, response) = router + .execute_query(Query::default().with_huge_query()) + .await; assert_eq!(response.status(), 413); assert_yaml_snapshot!(response.text().await?); diff --git a/apollo-router/tests/integration/introspection.rs b/apollo-router/tests/integration/introspection.rs index 56b2a496cd..8ad142a9cb 100644 --- a/apollo-router/tests/integration/introspection.rs +++ b/apollo-router/tests/integration/introspection.rs @@ -1,10 +1,11 @@ -use crate::integration::common::Query; -use crate::integration::IntegrationTest; use apollo_router::plugin::test::MockSubgraph; use apollo_router::services::supergraph::Request; use serde_json::json; use tower::ServiceExt; +use crate::integration::common::Query; +use crate::integration::IntegrationTest; + #[tokio::test] async fn simple() { let request = Request::fake_builder() diff --git a/apollo-router/tests/integration/operation_limits.rs b/apollo-router/tests/integration/operation_limits.rs index b0c5b25802..1b6b186e41 100644 --- a/apollo-router/tests/integration/operation_limits.rs +++ b/apollo-router/tests/integration/operation_limits.rs @@ -9,6 +9,7 @@ use apollo_router::TestHarness; use serde_json::json; use tower::BoxError; use tower::ServiceExt; + use crate::integration::common::Query; use crate::integration::IntegrationTest; @@ -310,7 +311,9 @@ async fn test_request_bytes_limit_with_coprocessor() -> Result<(), BoxError> { .await; 
router.start().await; router.assert_started().await; - let (_, resp) = router.execute_query(Query::default().with_huge_query()).await; + let (_, resp) = router + .execute_query(Query::default().with_huge_query()) + .await; assert_eq!(resp.status(), 413); router.graceful_shutdown().await; Ok(()) @@ -324,7 +327,9 @@ async fn test_request_bytes_limit() -> Result<(), BoxError> { .await; router.start().await; router.assert_started().await; - let (_, resp) = router.execute_query(Query::default().with_huge_query()).await; + let (_, resp) = router + .execute_query(Query::default().with_huge_query()) + .await; assert_eq!(resp.status(), 413); router.graceful_shutdown().await; Ok(()) diff --git a/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs b/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs index f3edb84232..6326d600ee 100644 --- a/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs +++ b/apollo-router/tests/integration/query_planner/max_evaluated_plans.rs @@ -1,4 +1,5 @@ use serde_json::json; + use crate::integration::common::Query; use crate::integration::IntegrationTest; @@ -31,10 +32,14 @@ async fn reports_evaluated_plans() { router.start().await; router.assert_started().await; router - .execute_query(Query::builder().body(json!({ - "query": r#"{ t { v1 v2 v3 v4 } }"#, - "variables": {}, - })).build()) + .execute_query( + Query::builder() + .body(json!({ + "query": r#"{ t { v1 v2 v3 v4 } }"#, + "variables": {}, + })) + .build(), + ) .await; let metrics = router @@ -70,10 +75,14 @@ async fn does_not_exceed_max_evaluated_plans() { router.start().await; router.assert_started().await; router - .execute_query(Query::builder().body(json!({ - "query": r#"{ t { v1 v2 v3 v4 } }"#, - "variables": {}, - })).build()) + .execute_query( + Query::builder() + .body(json!({ + "query": r#"{ t { v1 v2 v3 v4 } }"#, + "variables": {}, + })) + .build(), + ) .await; let metrics = router diff --git 
a/apollo-router/tests/integration/redis.rs b/apollo-router/tests/integration/redis.rs index ea9f37be62..07d16d7b92 100644 --- a/apollo-router/tests/integration/redis.rs +++ b/apollo-router/tests/integration/redis.rs @@ -40,7 +40,8 @@ use serde_json::Value; use tower::BoxError; use tower::ServiceExt; -use crate::integration::common::{graph_os_enabled, Query}; +use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::IntegrationTest; #[tokio::test(flavor = "multi_thread")] @@ -1072,11 +1073,15 @@ async fn test_redis_query_plan_config_update(updated_config: &str, new_cache_key ); assert_ne!(starting_key, new_cache_key, "starting_key (cache key for the initial config) and new_cache_key (cache key with the updated config) should not be equal. This either means that the cache key is not being generated correctly, or that the test is not actually checking the updated key."); - router.execute_query(Query::default().with_anonymous()).await; + router + .execute_query(Query::default().with_anonymous()) + .await; router.assert_redis_cache_contains(starting_key, None).await; router.update_config(updated_config).await; router.assert_reloaded().await; - router.execute_query(Query::default().with_anonymous()).await; + router + .execute_query(Query::default().with_anonymous()) + .await; router .assert_redis_cache_contains(new_cache_key, Some(starting_key)) .await; diff --git a/apollo-router/tests/integration/subgraph_response.rs b/apollo-router/tests/integration/subgraph_response.rs index cba16ca370..3f0f194d92 100644 --- a/apollo-router/tests/integration/subgraph_response.rs +++ b/apollo-router/tests/integration/subgraph_response.rs @@ -1,6 +1,7 @@ use serde_json::json; use tower::BoxError; use wiremock::ResponseTemplate; + use crate::integration::common::Query; use crate::integration::IntegrationTest; @@ -21,7 +22,9 @@ async fn test_subgraph_returning_data_null() -> Result<(), BoxError> { router.assert_started().await; let 
query = "{ __typename topProducts { name } }"; - let (_trace_id, response) = router.execute_query(Query::builder().body(json!({ "query": query })).build()).await; + let (_trace_id, response) = router + .execute_query(Query::builder().body(json!({ "query": query })).build()) + .await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -64,7 +67,9 @@ async fn test_subgraph_returning_different_typename_on_query_root() -> Result<() inside_fragment: __typename } "#; - let (_trace_id, response) = router.execute_query(Query::builder().body(json!({ "query": query })).build()).await; + let (_trace_id, response) = router + .execute_query(Query::builder().body(json!({ "query": query })).build()) + .await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -99,7 +104,11 @@ async fn test_valid_extensions_service_for_subgraph_error() -> Result<(), BoxErr router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -141,7 +150,11 @@ async fn test_valid_extensions_service_is_preserved_for_subgraph_error() -> Resu router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -174,7 +187,11 @@ async fn test_valid_extensions_service_for_invalid_subgraph_response() -> Result router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ 
topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -222,7 +239,11 @@ async fn test_valid_error_locations() -> Result<(), BoxError> { router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -264,7 +285,11 @@ async fn test_empty_error_locations() -> Result<(), BoxError> { router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -302,7 +327,11 @@ async fn test_invalid_error_locations() -> Result<(), BoxError> { router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -345,7 +374,11 @@ async fn test_invalid_error_locations_with_single_negative_one_location() -> Res router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ topProducts { name } }" })).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -387,7 +420,11 @@ async fn test_invalid_error_locations_contains_negative_one_location() -> Result router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ 
topProducts { name } }" })).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ topProducts { name } }" })) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( @@ -427,9 +464,7 @@ async fn test_valid_error_path() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let (_trace_id, response) = router - .execute_query(Query::default()) - .await; + let (_trace_id, response) = router.execute_query(Query::default()).await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -464,9 +499,7 @@ async fn test_invalid_error_path() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let (_trace_id, response) = router - .execute_query(Query::default()) - .await; + let (_trace_id, response) = router.execute_query(Query::default()).await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, @@ -502,9 +535,7 @@ async fn test_partially_valid_error_path() -> Result<(), BoxError> { router.start().await; router.assert_started().await; - let (_trace_id, response) = router - .execute_query(Query::default()) - .await; + let (_trace_id, response) = router.execute_query(Query::default()).await; assert_eq!(response.status(), 200); assert_eq!( response.json::().await?, diff --git a/apollo-router/tests/integration/subscription.rs b/apollo-router/tests/integration/subscription.rs index 74c42f2034..faad126f8e 100644 --- a/apollo-router/tests/integration/subscription.rs +++ b/apollo-router/tests/integration/subscription.rs @@ -4,7 +4,8 @@ use http::HeaderValue; use serde_json::json; use tower::BoxError; -use super::common::{IntegrationTest, Query}; +use super::common::IntegrationTest; +use super::common::Query; use super::common::Telemetry; const SUBSCRIPTION_CONFIG: &str = include_str!("../fixtures/subscription.router.yaml"); @@ -59,7 +60,10 @@ async fn test_subscription_load() -> Result<(), BoxError> { for _ in 0..100 { let (_id, resp) = router - 
.execute_query(Query::builder().body(json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}})).build(), + .execute_query( + Query::builder() + .body(json!({"query":"query ExampleQuery {topProducts{name}}","variables":{}})) + .build(), ) .await; assert!(resp.status().is_success()); diff --git a/apollo-router/tests/integration/supergraph.rs b/apollo-router/tests/integration/supergraph.rs index 5732b0921d..8d7ae9727b 100644 --- a/apollo-router/tests/integration/supergraph.rs +++ b/apollo-router/tests/integration/supergraph.rs @@ -1,6 +1,6 @@ - use serde_json::json; use tower::BoxError; + use crate::integration::common::Query; use crate::integration::IntegrationTest; @@ -100,7 +100,12 @@ async fn test_supergraph_errors_on_http1_header_that_does_not_fit_inside_buffer( router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ __typename }"})).header("test-header", "x".repeat(1048576 + 1)).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ __typename }"})) + .header("test-header", "x".repeat(1048576 + 1)) + .build(), + ) .await; assert_eq!(response.status(), 431); Ok(()) @@ -122,7 +127,12 @@ async fn test_supergraph_allow_to_change_http1_max_buf_size() -> Result<(), BoxE router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ "query": "{ __typename }"})).header("test-header", "x".repeat(1048576 + 1)).build()) + .execute_query( + Query::builder() + .body(json!({ "query": "{ __typename }"})) + .header("test-header", "x".repeat(1048576 + 1)) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index 591a2702fc..23bbc32652 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -790,7 +790,6 @@ impl Verifier 
for DatadogTraceSpec { } fn validate_span_kind(&self, trace: &Value, name: &str, kind: &str) -> Result<(), BoxError> { - let binding1 = trace.select_path(&format!( "$..[?(@.meta.['otel.original_name'] == '{}')].meta.['span.kind']", name diff --git a/apollo-router/tests/integration/telemetry/jaeger.rs b/apollo-router/tests/integration/telemetry/jaeger.rs index b18039bf26..2fc5e8a9f6 100644 --- a/apollo-router/tests/integration/telemetry/jaeger.rs +++ b/apollo-router/tests/integration/telemetry/jaeger.rs @@ -9,7 +9,8 @@ use serde_json::json; use serde_json::Value; use tower::BoxError; -use crate::integration::common::{Query, Telemetry}; +use crate::integration::common::Query; +use crate::integration::common::Telemetry; use crate::integration::telemetry::verifier::Verifier; use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; @@ -258,31 +259,50 @@ async fn test_span_attributes() -> Result<(), BoxError> { TraceSpec::builder() .services(["client", "router", "subgraph"].into()) .operation_name("ExampleQuery") - .span_attribute("router", [("http.request.method", "POST"), - ("http.response.status_code", "200"), - ("url.path", "/"), - ("http.request.header.x-my-header", "test"), - ("http.request.header.x-not-present", "nope"), - ("http.request.header.x-my-header-condition", "test"), - ("studio.operation.id", "*"), - ].into()) - .span_attribute("supergraph", [ - ("graphql.operation.name", "ExampleQuery"), - ("graphql.operation.type", "query"), - ("graphql.document", "query ExampleQuery {topProducts{name}}"), - ].into()) - .span_attribute("subgraph", [ - ("subgraph.graphql.operation.type", "query"), - ("subgraph.name", "products")].into()) + .span_attribute( + "router", + [ + ("http.request.method", "POST"), + ("http.response.status_code", "200"), + ("url.path", "/"), + ("http.request.header.x-my-header", "test"), + ("http.request.header.x-not-present", "nope"), + ("http.request.header.x-my-header-condition", "test"), + 
("studio.operation.id", "*"), + ] + .into(), + ) + .span_attribute( + "supergraph", + [ + ("graphql.operation.name", "ExampleQuery"), + ("graphql.operation.type", "query"), + ("graphql.document", "query ExampleQuery {topProducts{name}}"), + ] + .into(), + ) + .span_attribute( + "subgraph", + [ + ("subgraph.graphql.operation.type", "query"), + ("subgraph.name", "products"), + ] + .into(), + ) .build() - .validate_jaeger_trace(&mut router, Query::builder().header("x-my-header", "test").header("x-my-header-condition", "condition").build()) + .validate_jaeger_trace( + &mut router, + Query::builder() + .header("x-my-header", "test") + .header("x-my-header-condition", "condition") + .build(), + ) .await?; router.graceful_shutdown().await; } Ok(()) } - #[tokio::test(flavor = "multi_thread")] async fn test_decimal_trace_id() -> Result<(), BoxError> { let mut router = IntegrationTest::builder() @@ -327,19 +347,26 @@ impl Verifier for JaegerTraceSpec { fn verify_span_attributes(&self, trace: &Value) -> Result<(), BoxError> { for (span, attributes) in &self.span_attributes { for (key, value) in attributes { - let binding = trace.select_path(&format!("$..spans[?(@.operationName == '{span}')]..tags..[?(@.key == '{key}')].value"))?; + let binding = trace.select_path(&format!( + "$..spans[?(@.operationName == '{span}')]..tags..[?(@.key == '{key}')].value" + ))?; let actual_value = binding .first() - .expect(&format!("could not find attribute {key} on {span}")); + .unwrap_or_else(|| panic!("could not find attribute {key} on {span}")); match actual_value { Value::String(_) if *value == "*" => continue, - Value::String(s) => assert_eq!(s, value, "unexpected attribute {key} on {span}"), + Value::String(s) => { + assert_eq!(s, value, "unexpected attribute {key} on {span}") + } Value::Number(_) if *value == "*" => continue, - Value::Number(n) => assert_eq!(n.to_string(), *value, "unexpected attribute {key} on {span}"), + Value::Number(n) => assert_eq!( + n.to_string(), + *value, + 
"unexpected attribute {key} on {span}" + ), _ => panic!("unexpected value type"), } - } } Ok(()) diff --git a/apollo-router/tests/integration/telemetry/logging.rs b/apollo-router/tests/integration/telemetry/logging.rs index 21c19a3246..59dc1c7ccd 100644 --- a/apollo-router/tests/integration/telemetry/logging.rs +++ b/apollo-router/tests/integration/telemetry/logging.rs @@ -1,8 +1,9 @@ use tower::BoxError; use uuid::Uuid; -use crate::integration::common::{graph_os_enabled, Query}; +use crate::integration::common::graph_os_enabled; use crate::integration::common::IntegrationTest; +use crate::integration::common::Query; use crate::integration::common::Telemetry; #[tokio::test(flavor = "multi_thread")] diff --git a/apollo-router/tests/integration/telemetry/metrics.rs b/apollo-router/tests/integration/telemetry/metrics.rs index f63613fa00..56a5d6223d 100644 --- a/apollo-router/tests/integration/telemetry/metrics.rs +++ b/apollo-router/tests/integration/telemetry/metrics.rs @@ -2,7 +2,8 @@ use std::time::Duration; use serde_json::json; -use crate::integration::common::{graph_os_enabled, Query}; +use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::IntegrationTest; const PROMETHEUS_CONFIG: &str = include_str!("fixtures/prometheus.router.yaml"); @@ -106,9 +107,7 @@ async fn test_subgraph_auth_metrics() { router.update_config(PROMETHEUS_CONFIG).await; router.assert_reloaded().await; // This one will not be signed, counters shouldn't increment. - router - .execute_query(Query::default()) - .await; + router.execute_query(Query::default()).await; // Get Prometheus metrics. 
let metrics_response = router.get_metrics_response().await.unwrap(); @@ -267,11 +266,7 @@ async fn test_gauges_on_reload() { router.execute_default_query().await; // Introspection query - router - .execute_query(Query::introspection() - - ) - .await; + router.execute_query(Query::introspection()).await; // Persisted query router diff --git a/apollo-router/tests/integration/telemetry/mod.rs b/apollo-router/tests/integration/telemetry/mod.rs index 2769f18d9e..c814faa7ff 100644 --- a/apollo-router/tests/integration/telemetry/mod.rs +++ b/apollo-router/tests/integration/telemetry/mod.rs @@ -1,4 +1,5 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; +use std::collections::HashSet; #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] mod datadog; @@ -8,11 +9,10 @@ mod logging; mod metrics; mod otlp; mod propagation; +mod verifier; #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] mod zipkin; -mod verifier; -#[derive(buildstructor::Builder)] struct TraceSpec { operation_name: Option, version: Option, @@ -22,7 +22,34 @@ struct TraceSpec { unmeasured_spans: HashSet<&'static str>, priority_sampled: Option<&'static str>, subgraph_sampled: Option, - span_attributes: HashMap<&'static str, Vec<(&'static str, &'static str)>> + span_attributes: HashMap<&'static str, Vec<(&'static str, &'static str)>>, } - +#[buildstructor::buildstructor] +impl TraceSpec { + #[allow(clippy::too_many_arguments)] + #[builder] + pub fn new( + operation_name: Option, + version: Option, + services: Vec<&'static str>, + span_names: HashSet<&'static str>, + measured_spans: HashSet<&'static str>, + unmeasured_spans: HashSet<&'static str>, + priority_sampled: Option<&'static str>, + subgraph_sampled: Option, + span_attributes: HashMap<&'static str, Vec<(&'static str, &'static str)>>, + ) -> Self { + Self { + operation_name, + version, + services, + span_names, + measured_spans, + unmeasured_spans, + priority_sampled, 
+ subgraph_sampled, + span_attributes, + } + } +} diff --git a/apollo-router/tests/integration/telemetry/otlp.rs b/apollo-router/tests/integration/telemetry/otlp.rs index 6947879116..02a7ce3093 100644 --- a/apollo-router/tests/integration/telemetry/otlp.rs +++ b/apollo-router/tests/integration/telemetry/otlp.rs @@ -16,11 +16,12 @@ use wiremock::Mock; use wiremock::MockServer; use wiremock::ResponseTemplate; +use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::common::Telemetry; -use crate::integration::common::{graph_os_enabled, Query}; -use crate::integration::IntegrationTest; -use crate::integration::telemetry::TraceSpec; use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::TraceSpec; +use crate::integration::IntegrationTest; use crate::integration::ValueExt; #[tokio::test(flavor = "multi_thread")] @@ -62,11 +63,7 @@ async fn test_basic() -> Result<(), BoxError> { ) .subgraph_sampled(true) .build() - .validate_otlp_trace( - &mut router, - &mock_server, - Query::default(), - ) + .validate_otlp_trace(&mut router, &mock_server, Query::default()) .await?; TraceSpec::builder() .service("router") @@ -105,11 +102,7 @@ async fn test_otlp_request_with_datadog_propagator() -> Result<(), BoxError> { .priority_sampled("1") .subgraph_sampled(true) .build() - .validate_otlp_trace( - &mut router, - &mock_server, - Query::default(), - ) + .validate_otlp_trace(&mut router, &mock_server, Query::default()) .await?; router.graceful_shutdown().await; Ok(()) @@ -277,12 +270,14 @@ async fn test_untraced_request_no_sample_datadog_agent() -> Result<(), BoxError> let mock_server = mock_otlp_server().await; let config = include_str!("fixtures/otlp_datadog_agent_no_sample.router.yaml") .replace("", &mock_server.uri()); - let mut router = IntegrationTest::builder().config(&config) + let mut router = IntegrationTest::builder() + .config(&config) .telemetry(Telemetry::Otlp { endpoint: 
Some(format!("{}/v1/traces", mock_server.uri())), }) .extra_propagator(Telemetry::Datadog) - .build().await; + .build() + .await; router.start().await; router.assert_started().await; @@ -316,7 +311,8 @@ async fn test_untraced_request_sample_datadog_agent() -> Result<(), BoxError> { endpoint: Some(format!("{}/v1/traces", mock_server.uri())), }) .extra_propagator(Telemetry::Datadog) - .build().await; + .build() + .await; router.start().await; router.assert_started().await; @@ -542,7 +538,7 @@ async fn test_priority_sampling_no_parent_propagated() -> Result<(), BoxError> { struct OtlpTraceSpec<'a> { trace_spec: TraceSpec, - mock_server: &'a MockServer + mock_server: &'a MockServer, } impl Deref for OtlpTraceSpec<'_> { type Target = TraceSpec; @@ -552,7 +548,6 @@ impl Deref for OtlpTraceSpec<'_> { } } - impl Verifier for OtlpTraceSpec<'_> { fn verify_span_attributes(&self, _span: &Value) -> Result<(), BoxError> { // TODO @@ -580,7 +575,8 @@ impl Verifier for OtlpTraceSpec<'_> { } async fn find_valid_metrics(&self) -> Result<(), BoxError> { - let requests = self.mock_server + let requests = self + .mock_server .received_requests() .await .expect("Could not get otlp requests"); @@ -596,7 +592,6 @@ impl Verifier for OtlpTraceSpec<'_> { } } - async fn get_trace(&self, trace_id: TraceId) -> Result { let requests = self.mock_server.received_requests().await; let trace = Value::Array(requests.unwrap_or_default().iter().filter(|r| r.url.path().ends_with("/traces")) @@ -642,8 +637,6 @@ impl Verifier for OtlpTraceSpec<'_> { Ok(()) } - - fn verify_services(&self, trace: &Value) -> Result<(), axum::BoxError> { let actual_services: HashSet = trace .select_path("$..resource.attributes..[?(@.key == 'service.name')].value.stringValue")? 
@@ -713,7 +706,6 @@ impl Verifier for OtlpTraceSpec<'_> { Ok(()) } - fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { if let Some(expected_operation_name) = &self.operation_name { let binding = @@ -752,7 +744,6 @@ impl Verifier for OtlpTraceSpec<'_> { } Ok(()) } - } async fn mock_otlp_server() -> MockServer { @@ -788,18 +779,26 @@ impl DatadogId for TraceId { } } - impl TraceSpec { - async fn validate_otlp_trace(self, router: &mut IntegrationTest, mock_server: &MockServer, query: Query) -> Result<(), BoxError>{ + async fn validate_otlp_trace( + self, + router: &mut IntegrationTest, + mock_server: &MockServer, + query: Query, + ) -> Result<(), BoxError> { OtlpTraceSpec { trace_spec: self, - mock_server - }.validate_trace(router, query).await + mock_server, + } + .validate_trace(router, query) + .await } - async fn validate_otlp_metrics(self, mock_server: &MockServer) -> Result<(), BoxError>{ + async fn validate_otlp_metrics(self, mock_server: &MockServer) -> Result<(), BoxError> { OtlpTraceSpec { trace_spec: self, - mock_server - }.validate_metrics().await + mock_server, + } + .validate_metrics() + .await } -} \ No newline at end of file +} diff --git a/apollo-router/tests/integration/telemetry/propagation.rs b/apollo-router/tests/integration/telemetry/propagation.rs index d1b8258c6f..9505efa558 100644 --- a/apollo-router/tests/integration/telemetry/propagation.rs +++ b/apollo-router/tests/integration/telemetry/propagation.rs @@ -1,9 +1,10 @@ use serde_json::json; use tower::BoxError; +use crate::integration::common::graph_os_enabled; use crate::integration::common::IntegrationTest; +use crate::integration::common::Query; use crate::integration::common::Telemetry; -use crate::integration::common::{graph_os_enabled, Query}; #[tokio::test(flavor = "multi_thread")] async fn test_trace_id_via_header() -> Result<(), BoxError> { diff --git a/apollo-router/tests/integration/telemetry/verifier.rs 
b/apollo-router/tests/integration/telemetry/verifier.rs index bf4ffe4d70..59cbdf2683 100644 --- a/apollo-router/tests/integration/telemetry/verifier.rs +++ b/apollo-router/tests/integration/telemetry/verifier.rs @@ -1,15 +1,22 @@ -use crate::integration::common::Query; -use crate::integration::telemetry::TraceSpec; -use crate::integration::IntegrationTest; -use opentelemetry_api::trace::{SpanContext, TraceId}; -use serde_json::Value; use std::time::Duration; + use anyhow::anyhow; +use opentelemetry_api::trace::SpanContext; +use opentelemetry_api::trace::TraceId; +use serde_json::Value; use tower::BoxError; +use crate::integration::common::Query; +use crate::integration::telemetry::TraceSpec; +use crate::integration::IntegrationTest; + pub trait Verifier { fn spec(&self) -> &TraceSpec; - async fn validate_trace(&self, router: &mut IntegrationTest, query: Query) -> Result<(), BoxError> { + async fn validate_trace( + &self, + router: &mut IntegrationTest, + query: Query, + ) -> Result<(), BoxError> { let (id, response) = router.execute_query(query).await; for _ in 0..20 { if self.find_valid_trace(id).await.is_ok() { @@ -22,7 +29,6 @@ pub trait Verifier { assert!(response.status().is_success()); self.validate_subgraph(subgraph_context)?; Ok(()) - } async fn validate_metrics(&self) -> Result<(), BoxError> { @@ -40,18 +46,12 @@ pub trait Verifier { unimplemented!("find_valid_metrics") } - fn validate_subgraph( - &self, - subgraph_context: SpanContext, - ) -> Result<(), BoxError> { + fn validate_subgraph(&self, subgraph_context: SpanContext) -> Result<(), BoxError> { self.validate_subgraph_priority_sampled(&subgraph_context)?; self.validate_subgraph_sampled(&subgraph_context)?; Ok(()) } - fn validate_subgraph_sampled( - &self, - subgraph_context: &SpanContext, - ) -> Result<(), BoxError> { + fn validate_subgraph_sampled(&self, subgraph_context: &SpanContext) -> Result<(), BoxError> { if let Some(sampled) = self.spec().priority_sampled { assert_eq!( 
subgraph_context.trace_state().get("psr"), @@ -60,22 +60,19 @@ pub trait Verifier { ); } - Ok(()) } fn validate_subgraph_priority_sampled( &self, subgraph_context: &SpanContext, - ) -> Result<(), BoxError>{ + ) -> Result<(), BoxError> { if let Some(sampled) = self.spec().subgraph_sampled { assert_eq!(subgraph_context.is_sampled(), sampled, "subgraph sampled"); } Ok(()) } - - #[allow(clippy::too_many_arguments)] async fn find_valid_trace(&self, trace_id: TraceId) -> Result<(), BoxError> { // A valid trace has: @@ -156,5 +153,4 @@ pub trait Verifier { fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError>; fn verify_priority_sampled(&self, trace: &Value) -> Result<(), BoxError>; - } diff --git a/apollo-router/tests/integration/telemetry/zipkin.rs b/apollo-router/tests/integration/telemetry/zipkin.rs index 460c10add4..45f51620d0 100644 --- a/apollo-router/tests/integration/telemetry/zipkin.rs +++ b/apollo-router/tests/integration/telemetry/zipkin.rs @@ -3,16 +3,18 @@ extern crate core; use std::collections::HashSet; use std::ops::Deref; -use crate::integration::common::{Query, Telemetry}; -use crate::integration::telemetry::verifier::Verifier; -use crate::integration::telemetry::TraceSpec; -use crate::integration::IntegrationTest; -use crate::integration::ValueExt; use anyhow::anyhow; use opentelemetry_api::trace::TraceId; use serde_json::Value; use tower::BoxError; +use crate::integration::common::Query; +use crate::integration::common::Telemetry; +use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::TraceSpec; +use crate::integration::IntegrationTest; +use crate::integration::ValueExt; + #[tokio::test(flavor = "multi_thread")] async fn test_basic() -> Result<(), BoxError> { let mut router = IntegrationTest::builder() @@ -38,7 +40,6 @@ async fn test_basic() -> Result<(), BoxError> { Ok(()) } - struct ZipkinTraceSpec { trace_spec: TraceSpec, } @@ -55,11 +56,9 @@ impl Verifier for ZipkinTraceSpec { Ok(()) } fn 
verify_version(&self, _trace: &Value) -> Result<(), BoxError> { - Ok(()) } - fn measured_span(&self, _trace: &Value, _name: &str) -> Result { Ok(true) } @@ -76,7 +75,9 @@ impl Verifier for ZipkinTraceSpec { .collect(); tracing::debug!("found services {:?}", actual_services); - let expected_services = self.trace_spec.services + let expected_services = self + .trace_spec + .services .iter() .map(|s| s.to_string()) .collect::>(); @@ -98,8 +99,8 @@ impl Verifier for ZipkinTraceSpec { fn verify_operation_name(&self, trace: &Value) -> Result<(), BoxError> { if let Some(expected_operation_name) = &self.operation_name { - let binding = - trace.select_path("$..[?(@.name == 'supergraph')].tags..['graphql.operation.name']")?; + let binding = trace + .select_path("$..[?(@.name == 'supergraph')].tags..['graphql.operation.name']")?; let operation_name = binding.first(); assert_eq!( operation_name @@ -116,10 +117,15 @@ impl Verifier for ZipkinTraceSpec { Ok(()) } - async fn get_trace(&self, trace_id: TraceId) -> Result { let params = url::form_urlencoded::Serializer::new(String::new()) - .append_pair("service", self.trace_spec.services.first().expect("expected root service")) + .append_pair( + "service", + self.trace_spec + .services + .first() + .expect("expected root service"), + ) .finish(); let id = trace_id.to_string(); diff --git a/apollo-router/tests/integration/traffic_shaping.rs b/apollo-router/tests/integration/traffic_shaping.rs index 5d8b45e28b..579cb2b2a5 100644 --- a/apollo-router/tests/integration/traffic_shaping.rs +++ b/apollo-router/tests/integration/traffic_shaping.rs @@ -5,7 +5,8 @@ use serde_json::json; use tower::BoxError; use wiremock::ResponseTemplate; -use crate::integration::common::{graph_os_enabled, Query}; +use crate::integration::common::graph_os_enabled; +use crate::integration::common::Query; use crate::integration::common::Telemetry; use crate::integration::IntegrationTest; @@ -99,9 +100,13 @@ async fn 
test_router_timeout_operation_name_in_tracing() -> Result<(), BoxError> router.assert_started().await; let (_trace_id, response) = router - .execute_query(Query::builder().body(json!({ - "query": "query UniqueName { topProducts { name } }" - })).build()) + .execute_query( + Query::builder() + .body(json!({ + "query": "query UniqueName { topProducts { name } }" + })) + .build(), + ) .await; assert_eq!(response.status(), 504); let response = response.text().await?; diff --git a/apollo-router/tests/samples_tests.rs b/apollo-router/tests/samples_tests.rs index abc45e2ff1..5beba9d4b5 100644 --- a/apollo-router/tests/samples_tests.rs +++ b/apollo-router/tests/samples_tests.rs @@ -30,6 +30,7 @@ use wiremock::ResponseTemplate; #[path = "./common.rs"] pub(crate) mod common; pub(crate) use common::IntegrationTest; + use crate::common::Query; fn main() -> Result> { @@ -498,7 +499,12 @@ impl TestExecution { writeln!(out, "header: {:?}\n", headers).unwrap(); let (_, response) = router - .execute_query(Query::builder().body(request).headers(headers.clone()).build()) + .execute_query( + Query::builder() + .body(request) + .headers(headers.clone()) + .build(), + ) .await; writeln!(out, "response headers: {:?}", response.headers()).unwrap(); From 437f0531859c94494da519c146f0710814e71a1c Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 14:13:10 +0000 Subject: [PATCH 20/26] Fix a couple of new tests from merge --- apollo-router/tests/integration/supergraph.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/apollo-router/tests/integration/supergraph.rs b/apollo-router/tests/integration/supergraph.rs index 8d7ae9727b..07b4c81089 100644 --- a/apollo-router/tests/integration/supergraph.rs +++ b/apollo-router/tests/integration/supergraph.rs @@ -1,3 +1,6 @@ +#[cfg(feature = "hyper_header_limits")] +use std::collections::HashMap; + use serde_json::json; use tower::BoxError; @@ -45,7 +48,12 @@ async fn 
test_supergraph_errors_on_http1_max_headers() -> Result<(), BoxError> { } let (_trace_id, response) = router - .execute_query_with_headers(&json!({ "query": "{ __typename }"}), headers) + .execute_query( + Query::builder() + .body(json!({ "query": "{ __typename }"})) + .headers(headers) + .build(), + ) .await; assert_eq!(response.status(), 431); Ok(()) @@ -73,7 +81,12 @@ async fn test_supergraph_allow_to_change_http1_max_headers() -> Result<(), BoxEr } let (_trace_id, response) = router - .execute_query_with_headers(&json!({ "query": "{ __typename }"}), headers) + .execute_query( + Query::builder() + .body(json!({ "query": "{ __typename }"})) + .headers(headers) + .build(), + ) .await; assert_eq!(response.status(), 200); assert_eq!( From e701fda2126a06f6301fb068febb8c3d5758d10d Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 14:42:14 +0000 Subject: [PATCH 21/26] Make test less flaky --- apollo-router/tests/integration/telemetry/jaeger.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apollo-router/tests/integration/telemetry/jaeger.rs b/apollo-router/tests/integration/telemetry/jaeger.rs index 2fc5e8a9f6..8c38c59ec2 100644 --- a/apollo-router/tests/integration/telemetry/jaeger.rs +++ b/apollo-router/tests/integration/telemetry/jaeger.rs @@ -482,8 +482,10 @@ impl Verifier for JaegerTraceSpec { if let Some(expected_operation_name) = &self.operation_name { let binding = trace.select_path("$..spans[?(@.operationName == 'supergraph')]..tags[?(@.key == 'graphql.operation.name')].value")?; - println!("binding: {:?}", binding); let operation_name = binding.first(); + if operation_name.is_none() { + return Err(BoxError::from("graphql.operation.name not found")); + } assert_eq!( operation_name .expect("graphql.operation.name expected") From be6842057cca4bcca5df27727bd35320b84b5e96 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 20:09:13 +0000 Subject: [PATCH 22/26] Revert change to propagator order --- 
apollo-router/src/plugins/telemetry/mod.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/apollo-router/src/plugins/telemetry/mod.rs b/apollo-router/src/plugins/telemetry/mod.rs index 95b8083525..e70b2cf55c 100644 --- a/apollo-router/src/plugins/telemetry/mod.rs +++ b/apollo-router/src/plugins/telemetry/mod.rs @@ -947,18 +947,21 @@ impl Telemetry { if propagation.zipkin || tracing.zipkin.enabled { propagators.push(Box::::default()); } + if propagation.datadog || tracing.datadog.enabled() { + propagators.push(Box::::default()); + } if propagation.aws_xray { propagators.push(Box::::default()); } + + // This propagator MUST come last because the user is trying to override the default behavior of the + // other propagators. if let Some(from_request_header) = &propagation.request.header_name { propagators.push(Box::new(CustomTraceIdPropagator::new( from_request_header.to_string(), propagation.request.format.clone(), ))); } - if propagation.datadog || tracing.datadog.enabled() { - propagators.push(Box::::default()); - } TextMapCompositePropagator::new(propagators) } From 1e6b5593246773c246822980f2a8e428d8d0a488 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 20:27:26 +0000 Subject: [PATCH 23/26] Add test for custom propagation overriding the DD propagator --- .../tests/integration/telemetry/datadog.rs | 33 +++++++++++++++++++ ...dog_header_propagator_override.router.yaml | 29 ++++++++++++++++ .../tests/integration/telemetry/mod.rs | 3 ++ .../tests/integration/telemetry/verifier.rs | 3 ++ 4 files changed, 68 insertions(+) create mode 100644 apollo-router/tests/integration/telemetry/fixtures/datadog_header_propagator_override.router.yaml diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index 23bbc32652..b0427ec691 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -439,6 +439,39 @@ async 
fn test_override_span_names_late() -> Result<(), BoxError> { Ok(()) } +#[tokio::test(flavor = "multi_thread")] +async fn test_header_propagator_override() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_header_propagator_override.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .subgraph_sampled(true) + .trace_id("00000000000000000000000000000001") + .build() + .validate_datadog_trace( + &mut router, + Query::builder() + .header("trace-id", "00000000000000000000000000000001") + .header("x-datadog-trace-id", "2") + .traced(false) + .build(), + ) + .await?; + router.graceful_shutdown().await; + Ok(()) +} + #[tokio::test(flavor = "multi_thread")] async fn test_basic() -> Result<(), BoxError> { if !graph_os_enabled() { diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_header_propagator_override.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_header_propagator_override.router.yaml new file mode 100644 index 0000000000..595639f1ff --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_header_propagator_override.router.yaml @@ -0,0 +1,29 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + format: datadog + propagation: + datadog: true + request: + header_name: trace-id + common: + service_name: router + parent_based_sampler: false + resource: + env: local1 + service.version: router_version_override + preview_datadog_agent_sampling: true + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true + diff --git 
a/apollo-router/tests/integration/telemetry/mod.rs b/apollo-router/tests/integration/telemetry/mod.rs index c814faa7ff..4edf023702 100644 --- a/apollo-router/tests/integration/telemetry/mod.rs +++ b/apollo-router/tests/integration/telemetry/mod.rs @@ -22,6 +22,7 @@ struct TraceSpec { unmeasured_spans: HashSet<&'static str>, priority_sampled: Option<&'static str>, subgraph_sampled: Option, + trace_id: Option<&'static str>, span_attributes: HashMap<&'static str, Vec<(&'static str, &'static str)>>, } @@ -38,6 +39,7 @@ impl TraceSpec { unmeasured_spans: HashSet<&'static str>, priority_sampled: Option<&'static str>, subgraph_sampled: Option, + trace_id: Option<&'static str>, span_attributes: HashMap<&'static str, Vec<(&'static str, &'static str)>>, ) -> Self { Self { @@ -50,6 +52,7 @@ impl TraceSpec { priority_sampled, subgraph_sampled, span_attributes, + trace_id, } } } diff --git a/apollo-router/tests/integration/telemetry/verifier.rs b/apollo-router/tests/integration/telemetry/verifier.rs index 59cbdf2683..c6f92cddef 100644 --- a/apollo-router/tests/integration/telemetry/verifier.rs +++ b/apollo-router/tests/integration/telemetry/verifier.rs @@ -18,6 +18,9 @@ pub trait Verifier { query: Query, ) -> Result<(), BoxError> { let (id, response) = router.execute_query(query).await; + if let Some(spec_id) = self.spec().trace_id { + assert_eq!(id.to_string(), spec_id, "trace id"); + } for _ in 0..20 { if self.find_valid_trace(id).await.is_ok() { break; From 55ec86e4d875a19caf4ac5ee5726425836c2ca70 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 21:45:46 +0000 Subject: [PATCH 24/26] Add test and fix datadog propagator to fall back to the span sampling if psr is not in context.. 
--- .../telemetry/tracing/datadog_exporter/mod.rs | 8 ++++- .../tests/integration/telemetry/datadog.rs | 30 +++++++++++++++++++ ...adog_agent_sampling_disabled_1.router.yaml | 22 ++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_1.router.yaml diff --git a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs index 74907ee6a4..c8ee8c4425 100644 --- a/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs +++ b/apollo-router/src/plugins/telemetry/tracing/datadog_exporter/mod.rs @@ -450,7 +450,13 @@ pub(crate) mod propagator { let sampling_priority = span_context .trace_state() .sampling_priority() - .unwrap_or_default(); + .unwrap_or_else(|| { + if span_context.is_sampled() { + SamplingPriority::AutoKeep + } else { + SamplingPriority::AutoReject + } + }); injector.set( DATADOG_SAMPLING_PRIORITY_HEADER, (sampling_priority as i32).to_string(), diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index b0427ec691..95d6117092 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -70,6 +70,36 @@ async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> { Ok(()) } + + +// We want to check we're able to override the behavior of preview_datadog_agent_sampling configuration even if we set a datadog exporter +#[tokio::test(flavor = "multi_thread")] +async fn test_sampling_datadog_agent_disabled_always_sample() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_agent_sampling_disabled_1.router.yaml" + )) + .build() + .await; + + router.start().await; + 
router.assert_started().await; + + TraceSpec::builder() + .services(["router", "subgraph"].into()) + .subgraph_sampled(true) + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + router.graceful_shutdown().await; + + Ok(()) +} + #[tokio::test(flavor = "multi_thread")] async fn test_priority_sampling_propagated() -> Result<(), BoxError> { if !graph_os_enabled() { diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_1.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_1.router.yaml new file mode 100644 index 0000000000..2334508de4 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_1.router.yaml @@ -0,0 +1,22 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + sampler: 1.0 + preview_datadog_agent_sampling: false + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + fixed_span_names: false + enable_span_mapping: false + instrumentation: + spans: + mode: spec_compliant + From 3b5d256251128c72d85b5473bf64d34a39d464c3 Mon Sep 17 00:00:00 2001 From: bryn Date: Fri, 13 Dec 2024 21:51:42 +0000 Subject: [PATCH 25/26] Add test for non datadog agent propagation zero percent sampling. 
--- .../tests/integration/telemetry/datadog.rs | 49 +++++++++++++++++-- ...adog_agent_sampling_disabled_0.router.yaml | 22 +++++++++ 2 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_0.router.yaml diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index 95d6117092..2a58132927 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -70,8 +70,6 @@ async fn test_sampling_datadog_agent_disabled() -> Result<(), BoxError> { Ok(()) } - - // We want to check we're able to override the behavior of preview_datadog_agent_sampling configuration even if we set a datadog exporter #[tokio::test(flavor = "multi_thread")] async fn test_sampling_datadog_agent_disabled_always_sample() -> Result<(), BoxError> { @@ -92,9 +90,53 @@ async fn test_sampling_datadog_agent_disabled_always_sample() -> Result<(), BoxE TraceSpec::builder() .services(["router", "subgraph"].into()) .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) + .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) + .await?; + router.graceful_shutdown().await; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_sampling_datadog_agent_disabled_never_sample() -> Result<(), BoxError> { + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_agent_sampling_disabled_0.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + TraceSpec::builder() + 
.services([].into()) + .subgraph_sampled(false) .build() .validate_datadog_trace(&mut router, Query::builder().traced(false).build()) .await?; + + TraceSpec::builder() + .services(["client", "router", "subgraph"].into()) + .subgraph_sampled(true) + .priority_sampled("1") + .build() + .validate_datadog_trace(&mut router, Query::builder().traced(true).build()) + .await?; router.graceful_shutdown().await; Ok(()) @@ -911,7 +953,8 @@ impl Verifier for DatadogTraceSpec { .as_f64() .expect("psr not string") .to_string(), - psr + psr, + "psr mismatch" ); } } diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_0.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_0.router.yaml new file mode 100644 index 0000000000..42f56dd642 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_agent_sampling_disabled_0.router.yaml @@ -0,0 +1,22 @@ +telemetry: + apollo: + field_level_instrumentation_sampler: always_off + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + common: + service_name: router + sampler: 0.0 + preview_datadog_agent_sampling: false + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + fixed_span_names: false + enable_span_mapping: false + instrumentation: + spans: + mode: spec_compliant + From 0632ee6e6881919d9885e95483fce6b11474a148 Mon Sep 17 00:00:00 2001 From: bryn Date: Sun, 15 Dec 2024 22:38:29 +0000 Subject: [PATCH 26/26] Improve trace_id test --- .../tests/integration/telemetry/datadog.rs | 80 ++++++++++++++++--- .../telemetry/fixtures/datadog.router.yaml | 3 - .../datadog_no_parent_sampler.router.yaml | 3 - ...adog_parent_sampler_very_small.router.yaml | 5 +- ...t_sampler_very_small_no_parent.router.yaml | 25 ++++++ .../tests/integration/telemetry/mod.rs | 17 +++- .../tests/integration/telemetry/otlp.rs | 11 +-- .../tests/integration/telemetry/verifier.rs | 4 +- 8 
files changed, 112 insertions(+), 36 deletions(-) create mode 100644 apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml diff --git a/apollo-router/tests/integration/telemetry/datadog.rs b/apollo-router/tests/integration/telemetry/datadog.rs index 2a58132927..db33307ae9 100644 --- a/apollo-router/tests/integration/telemetry/datadog.rs +++ b/apollo-router/tests/integration/telemetry/datadog.rs @@ -12,6 +12,7 @@ use crate::integration::common::graph_os_enabled; use crate::integration::common::Query; use crate::integration::common::Telemetry; use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::DatadogId; use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; use crate::integration::ValueExt; @@ -357,6 +358,69 @@ async fn test_priority_sampling_parent_sampler_very_small() -> Result<(), BoxErr Ok(()) } +#[tokio::test(flavor = "multi_thread")] +async fn test_priority_sampling_parent_sampler_very_small_no_parent() -> Result<(), BoxError> { + // Note that there is a very small chance this test will fail. We are trying to test a non-zero sampler. + + if !graph_os_enabled() { + return Ok(()); + } + let mut router = IntegrationTest::builder() + .telemetry(Telemetry::Datadog) + .config(include_str!( + "fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml" + )) + .build() + .await; + + router.start().await; + router.assert_started().await; + + // // The router should respect upstream but also almost never sample if left to its own devices. 
+ TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("-1").traced(true).build()) + .await?; + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("0").traced(true).build()) + .await?; + + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("1").traced(true).build()) + .await?; + + TraceSpec::builder() + .services(["client", "router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("2").traced(true).build()) + .await?; + + TraceSpec::builder() + .services(["router"].into()) + .priority_sampled("0") + .subgraph_sampled(false) + .build() + .validate_datadog_trace(&mut router, Query::builder().psr("2").traced(false).build()) + .await?; + + router.graceful_shutdown().await; + + Ok(()) +} + #[tokio::test(flavor = "multi_thread")] async fn test_untraced_request() -> Result<(), BoxError> { if !graph_os_enabled() { @@ -524,17 +588,19 @@ async fn test_header_propagator_override() -> Result<(), BoxError> { .build() .await; + let trace_id = opentelemetry::trace::TraceId::from_u128(uuid::Uuid::new_v4().as_u128()); + router.start().await; router.assert_started().await; TraceSpec::builder() .services(["router", "subgraph"].into()) .subgraph_sampled(true) - .trace_id("00000000000000000000000000000001") + .trace_id(format!("{:032x}", trace_id.to_datadog())) .build() .validate_datadog_trace( &mut router, Query::builder() - .header("trace-id", "00000000000000000000000000000001") + .header("trace-id", trace_id.to_string()) .header("x-datadog-trace-id", "2") .traced(false) .build(), @@ -779,16 +845,6 @@ async fn 
test_span_metrics() -> Result<(), BoxError> { Ok(()) } -pub(crate) trait DatadogId { - fn to_datadog(&self) -> String; -} -impl DatadogId for TraceId { - fn to_datadog(&self) -> String { - let bytes = &self.to_bytes()[std::mem::size_of::<u64>()..std::mem::size_of::<u128>()]; - u64::from_be_bytes(bytes.try_into().unwrap()).to_string() - } -} - struct DatadogTraceSpec { trace_spec: TraceSpec, } diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml index 0f0f50dd78..c1c4b2096e 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog.router.yaml @@ -5,9 +5,6 @@ telemetry: enabled: true header_name: apollo-custom-trace-id format: datadog - propagation: - trace_context: true - jaeger: true common: service_name: router resource: diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml index 2e9c634dd9..c6ec7c22b7 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_no_parent_sampler.router.yaml @@ -5,9 +5,6 @@ telemetry: enabled: true header_name: apollo-custom-trace-id format: datadog - propagation: - trace_context: true - jaeger: true common: service_name: router parent_based_sampler: false diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml index 90a5594503..206e72d1b1 100644 --- a/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small.router.yaml @@ -5,12
+5,9 @@ telemetry: enabled: true header_name: apollo-custom-trace-id format: datadog - propagation: - trace_context: true - jaeger: true common: service_name: router - sampler: 0.000000001 + sampler: 0.00001 parent_based_sampler: true resource: env: local1 diff --git a/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml new file mode 100644 index 0000000000..658b7d2361 --- /dev/null +++ b/apollo-router/tests/integration/telemetry/fixtures/datadog_parent_sampler_very_small_no_parent.router.yaml @@ -0,0 +1,25 @@ +telemetry: + exporters: + tracing: + experimental_response_trace_id: + enabled: true + header_name: apollo-custom-trace-id + format: datadog + common: + service_name: router + sampler: 0.00001 + parent_based_sampler: false + resource: + env: local1 + service.version: router_version_override + preview_datadog_agent_sampling: true + datadog: + enabled: true + batch_processor: + scheduled_delay: 100ms + instrumentation: + spans: + mode: spec_compliant + supergraph: + attributes: + graphql.operation.name: true diff --git a/apollo-router/tests/integration/telemetry/mod.rs b/apollo-router/tests/integration/telemetry/mod.rs index 4edf023702..6319182e62 100644 --- a/apollo-router/tests/integration/telemetry/mod.rs +++ b/apollo-router/tests/integration/telemetry/mod.rs @@ -1,6 +1,8 @@ use std::collections::HashMap; use std::collections::HashSet; +use opentelemetry_api::trace::TraceId; + #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] mod datadog; #[cfg(any(not(feature = "ci"), all(target_arch = "x86_64", target_os = "linux")))] @@ -22,7 +24,7 @@ struct TraceSpec { unmeasured_spans: HashSet<&'static str>, priority_sampled: Option<&'static str>, subgraph_sampled: Option<bool>, - trace_id: Option<&'static str>, + trace_id: Option<String>, span_attributes: HashMap<&'static str,
Vec<(&'static str, &'static str)>>, } @@ -39,7 +41,7 @@ impl TraceSpec { unmeasured_spans: HashSet<&'static str>, priority_sampled: Option<&'static str>, subgraph_sampled: Option<bool>, - trace_id: Option<&'static str>, + trace_id: Option<String>, span_attributes: HashMap<&'static str, Vec<(&'static str, &'static str)>>, ) -> Self { Self { @@ -56,3 +58,14 @@ impl TraceSpec { } } } + +#[allow(dead_code)] +pub trait DatadogId { + fn to_datadog(&self) -> u64; +} +impl DatadogId for TraceId { + fn to_datadog(&self) -> u64 { + let bytes = &self.to_bytes()[std::mem::size_of::<u64>()..std::mem::size_of::<u128>()]; + u64::from_be_bytes(bytes.try_into().unwrap()) + } +} diff --git a/apollo-router/tests/integration/telemetry/otlp.rs b/apollo-router/tests/integration/telemetry/otlp.rs index 02a7ce3093..af73bc32e8 100644 --- a/apollo-router/tests/integration/telemetry/otlp.rs +++ b/apollo-router/tests/integration/telemetry/otlp.rs @@ -20,6 +20,7 @@ use crate::integration::common::graph_os_enabled; use crate::integration::common::Query; use crate::integration::common::Telemetry; use crate::integration::telemetry::verifier::Verifier; +use crate::integration::telemetry::DatadogId; use crate::integration::telemetry::TraceSpec; use crate::integration::IntegrationTest; use crate::integration::ValueExt; @@ -769,16 +770,6 @@ async fn mock_otlp_server() -> MockServer { mock_server } -pub(crate) trait DatadogId { - fn to_datadog(&self) -> u64; -} -impl DatadogId for TraceId { - fn to_datadog(&self) -> u64 { - let bytes = &self.to_bytes()[std::mem::size_of::<u64>()..std::mem::size_of::<u128>()]; - u64::from_be_bytes(bytes.try_into().unwrap()) - } -} - impl TraceSpec { async fn validate_otlp_trace( self, diff --git a/apollo-router/tests/integration/telemetry/verifier.rs b/apollo-router/tests/integration/telemetry/verifier.rs index c6f92cddef..3fe9fdabbd 100644 --- a/apollo-router/tests/integration/telemetry/verifier.rs +++ b/apollo-router/tests/integration/telemetry/verifier.rs @@ -18,8 +18,8 @@ pub trait Verifier { query:
Query, ) -> Result<(), BoxError> { let (id, response) = router.execute_query(query).await; - if let Some(spec_id) = self.spec().trace_id { - assert_eq!(id.to_string(), spec_id, "trace id"); + if let Some(spec_id) = &self.spec().trace_id { + assert_eq!(id.to_string(), *spec_id, "trace id"); } for _ in 0..20 { if self.find_valid_trace(id).await.is_ok() {