Skip to content

Correlate traces to profiles with the OTel ebpf profiler #197

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ cc_library(
"include/datadog/span_matcher.h",
"include/datadog/span_sampler_config.h",
"include/datadog/string_view.h",
"include/datadog/tls_storage.h",
"include/datadog/tracer.h",
"include/datadog/tracer_config.h",
"include/datadog/tracer_signature.h",
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ options is not needed. The `-ldd_trace_cpp` option is always needed.
c++ -o my_app my_app.o -L/path/to/dd-trace-cpp/.install/lib -ldd_trace_cpp
```

### Optional: Trace to Profile correlation
In order to correlate traces to profiles generated by the full host profiler[[1](https://github.com/DataDog/opentelemetry-ebpf-profiler), [2](https://github.com/DataDog/dd-otel-host-profiler)],
there are a couple of requirements for the build:

[//]: # (TODO: add a link to the specification)
- The app must be built on Linux, since the profiler leverages eBPF, a Linux kernel feature.
- The compiler used for the build must support the `-mtls-dialect` flag (`gnu2` on x86-64, `desc` on aarch64; GCC-11+ and Clang-19+).

Test
----
Pass `-DDD_TRACE_BUILD_TESTING=1` to `cmake` to include the unit tests in the build.
Expand Down
20 changes: 20 additions & 0 deletions cmake/compiler/clang.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@
add_library(dd_trace_cpp-specs INTERFACE)
add_library(dd_trace::specs ALIAS dd_trace_cpp-specs)

if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
set(TLS_DIALECT desc)
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
set(TLS_DIALECT gnu2)
else()
message(FATAL_ERROR "Only aarch64 and x86-64 are supported (found: ${CMAKE_SYSTEM_PROCESSOR})")
endif()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part could be conditioned to an expected $TLS_DIALECT set above

include(CheckCompilerFlag)
check_compiler_flag(CXX "-mtls-dialect=${TLS_DIALECT}" TLS_DIALECT_OK)
if (TLS_DIALECT_OK)
target_compile_options(dd_trace_cpp-specs INTERFACE
-fPIC
-ftls-model=global-dynamic
-mtls-dialect=${TLS_DIALECT}
)
endif()
endif()

target_compile_options(dd_trace_cpp-specs
INTERFACE
-Wall
Expand Down
20 changes: 20 additions & 0 deletions cmake/compiler/gcc.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@
add_library(dd_trace_cpp-specs INTERFACE)
add_library(dd_trace::specs ALIAS dd_trace_cpp-specs)

# Trace-to-profile correlation support (Linux only): select the TLS dialect
# for the target architecture and, when the compiler accepts it, add the TLS
# related flags to the interface target so they propagate to its consumers.
if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
# The descriptor-based TLS dialect is spelled `desc` on aarch64 and `gnu2`
# on x86-64.
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
set(TLS_DIALECT desc)
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
set(TLS_DIALECT gnu2)
else()
# NOTE(review): this aborts configuration on every other Linux architecture,
# even though the compiler-flag check below treats the feature as optional.
# Confirm that failing the whole build is intended.
message(FATAL_ERROR "Only aarch64 and x86-64 are supported (found: ${CMAKE_SYSTEM_PROCESSOR})")
endif()

# Probe the C++ compiler for the selected dialect; the result is stored in
# TLS_DIALECT_OK.
include(CheckCompilerFlag)
check_compiler_flag(CXX "-mtls-dialect=${TLS_DIALECT}" TLS_DIALECT_OK)
# The flags are only added when the probe succeeded; otherwise the build
# proceeds without them.
if (TLS_DIALECT_OK)
target_compile_options(dd_trace_cpp-specs INTERFACE
-fPIC
-ftls-model=global-dynamic
-mtls-dialect=${TLS_DIALECT}
)
endif()
endif()

target_compile_options(dd_trace_cpp-specs
INTERFACE
-Wall
Expand Down
1 change: 1 addition & 0 deletions include/datadog/environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ namespace environment {
MACRO(DD_TRACE_TAGS_PROPAGATION_MAX_LENGTH) \
MACRO(DD_VERSION) \
MACRO(DD_TRACE_128_BIT_TRACEID_GENERATION_ENABLED) \
MACRO(DD_TRACE_CORRELATE_FULL_HOST_PROFILES) \
MACRO(DD_TELEMETRY_HEARTBEAT_INTERVAL) \
MACRO(DD_TELEMETRY_METRICS_ENABLED) \
MACRO(DD_TELEMETRY_METRICS_INTERVAL_SECONDS) \
Expand Down
25 changes: 25 additions & 0 deletions include/datadog/tls_storage.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include <datadog/trace_id.h>

#include <array>
#include <cstdint>

// Global struct used to expose thread-specific information.
// https://github.com/elastic/apm/blob/149cd3e39a77a58002344270ed2ad35357bdd02d/specs/agents/universal-profiling-integration.md#thread-local-storage-layout

namespace datadog {
namespace tracing {
// Per-thread record describing the trace context that is currently active on
// the thread. The struct is packed, so the fields are laid out back to back
// with no padding, in declaration order.
struct __attribute__((packed)) TLSStorage {
  // Minor version of this layout.
  uint16_t layout_minor_version;
  // Writers set this to 0 before modifying the record and back to 1 once the
  // update is complete.
  uint8_t valid;
  // Non-zero while a trace is active on the thread; cleared when the trace
  // segment finishes.
  uint8_t trace_present;
  // NOTE(review): presumably the W3C trace flags of the active trace — confirm
  // against the specification.
  uint8_t trace_flags;
  // The trace ID, split into two 64-bit halves.
  // NOTE(review): confirm that this ordering and byte order match what the
  // profiler expects for the 16-byte trace ID.
  uint64_t trace_id_low;
  uint64_t trace_id_high;
  // ID of the span currently active on the thread.
  uint64_t span_id;
  // NOTE(review): presumably the ID of the local root span — confirm.
  uint64_t transaction_id;
};

// The layout is a binary contract: 2 + 1 + 1 + 1 + 4 * 8 bytes. Fail the build
// if a field is added, removed or resized, or if packing stops taking effect.
static_assert(sizeof(TLSStorage) == 37,
              "TLSStorage must match the 37-byte thread-local storage layout");

} // namespace tracing
} // namespace datadog
2 changes: 2 additions & 0 deletions include/datadog/trace_segment.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ class TraceSegment {
const Optional<std::string>& origin() const;
Optional<SamplingDecision> sampling_decision() const;

uint64_t local_root_id() const;

Logger& logger() const;

// Inject trace context for the specified `span` into the specified `writer`.
Expand Down
14 changes: 14 additions & 0 deletions include/datadog/tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
// obtained from a `TracerConfig` via the `finalize_config` function. See
// `tracer_config.h`.

#ifdef __linux__
#include <datadog/tls_storage.h>
#endif

#include <cstddef>
#include <memory>

Expand All @@ -23,6 +27,12 @@
#include "tracer_config.h"
#include "tracer_signature.h"

#ifdef __linux__
// Process-level correlation data. Code in span.cpp and trace_segment.cpp only
// touches the thread-level record below when this pointer is non-null.
// NOTE(review): the definition and the place where this pointer is assigned
// are not visible from this header — presumably it is set by `Tracer` when
// `correlate_full_host_profiles` is enabled; confirm.
extern const void* elastic_apm_profiling_correlation_process_storage_v1;
// Thread-level correlation record; each thread has its own pointer.
// NOTE(review): callers dereference this without a null check, so it must
// point to a valid `TLSStorage` on any thread that finishes a span while
// correlation is active — confirm where it is initialized.
extern thread_local struct datadog::tracing::TLSStorage*
elastic_apm_profiling_correlation_tls_v1;
#endif

namespace datadog {
namespace tracing {

Expand Down Expand Up @@ -54,6 +64,7 @@ class Tracer {
Baggage::Options baggage_opts_;
bool baggage_injection_enabled_;
bool baggage_extraction_enabled_;
bool correlate_full_host_profiles_;

public:
// Create a tracer configured using the specified `config`, and optionally:
Expand Down Expand Up @@ -105,6 +116,9 @@ class Tracer {
std::string config() const;

private:
#ifdef __linux__
void correlate(const Span& span);
#endif
void store_config();
};

Expand Down
9 changes: 9 additions & 0 deletions include/datadog/tracer_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,14 @@ struct TracerConfig {
// the `DD_TRACE_128_BIT_TRACEID_GENERATION_ENABLED` environment variable.
Optional<bool> generate_128bit_trace_ids;

// `correlate_full_host_profiles` indicates whether we want to correlate
// traces and spans with profiles generated by the eBPF full host profiler.
// This correlation only works on linux, due to the eBPF-based nature of
// the profiler. It implies writing some process-level and thread-level
// data in variables which the profiler will then read from the process's
// memory.
Optional<bool> correlate_full_host_profiles;

// `runtime_id` denotes the current run of the application in which the tracer
// is embedded. If `runtime_id` is not specified, then it defaults to a
// pseudo-randomly generated value. A server that contains multiple tracers,
Expand Down Expand Up @@ -197,6 +205,7 @@ class FinalizedTracerConfig final {
std::shared_ptr<Logger> logger;
bool log_on_startup;
bool generate_128bit_trace_ids;
bool correlate_full_host_profiles;
Optional<RuntimeID> runtime_id;
Clock clock;
std::string integration_name;
Expand Down
43 changes: 43 additions & 0 deletions include/datadog/tracer_signature.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@
// polling the Datadog Agent. See
// `RemoteConfigurationManager::process_response` in `remote_config.h`.

#ifdef __linux__
#include <cstring>
#include <memory>
#include <vector>
#endif

#include <string>

#include "runtime_id.h"
Expand All @@ -31,6 +37,17 @@
namespace datadog {
namespace tracing {

#ifdef __linux__
// Append `str` to `buffer` as a length-prefixed string: a 32-bit length in
// host byte order, followed by the bytes of `str` copied verbatim (no
// terminator, no encoding validation).
//
// This is declared `inline` instead of living in an unnamed namespace because
// this file is a header: an unnamed namespace would give every including
// translation unit its own internal-linkage copy of the function.
inline void write_utf8_string(std::vector<uint8_t>& buffer,
                              const std::string& str) {
  // The prefix is 32 bits wide, so the conversion from `size_t` is made
  // explicit here rather than left as an implicit narrowing.
  const auto length = static_cast<uint32_t>(str.size());
  const auto* const length_bytes = reinterpret_cast<const uint8_t*>(&length);
  buffer.insert(buffer.end(), length_bytes, length_bytes + sizeof(length));
  buffer.insert(buffer.end(), str.begin(), str.end());
}
#endif

struct TracerSignature {
RuntimeID runtime_id;
std::string default_service;
Expand All @@ -47,6 +64,32 @@ struct TracerSignature {
library_version(tracer_version),
library_language("cpp"),
library_language_version(DD_TRACE_STRINGIFY(__cplusplus), 6) {}

#ifdef __linux__
// Serialize the process-level data used to correlate traces with profiles
// generated by dd-otel-host-profiler.
//
// The serialized form is, in order: a 16-bit layout minor version, then the
// default service, the default environment and the runtime ID, each written
// by `write_utf8_string`.
// Layout:
// https://github.com/elastic/apm/blob/149cd3e39a77a58002344270ed2ad35357bdd02d/specs/agents/universal-profiling-integration.md#process-storage-layout
//
// NOTE(review): the returned `unique_ptr` owns only the heap cell that holds
// the `uint8_t*`; destroying it does not `delete[]` the byte array allocated
// below. Confirm that keeping the array alive for the rest of the process is
// intended.
const std::unique_ptr<uint8_t*> generate_process_correlation_storage() {
  // The minor version is 2 to distinguish this layout from Elastic's version,
  // which includes a socket path.
  uint16_t layout_minor_version = 2;
  auto* const version_begin =
      reinterpret_cast<uint8_t*>(&layout_minor_version);

  // Start the serialized form with the raw bytes of the version field.
  std::vector<uint8_t> serialized(
      version_begin, version_begin + sizeof(layout_minor_version));

  write_utf8_string(serialized, default_service);
  write_utf8_string(serialized, default_environment);
  write_utf8_string(serialized, runtime_id.string());

  // Copy the bytes into a free-standing array that outlives the vector.
  const auto size = serialized.size();
  uint8_t* storage = new uint8_t[size];
  memcpy(storage, serialized.data(), size);
  return std::make_unique<uint8_t*>(storage);
}
#endif
};

} // namespace tracing
Expand Down
7 changes: 5 additions & 2 deletions src/datadog/datadog_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,11 @@ DatadogAgent::~DatadogAgent() {
// Queue `spans`, together with `response_handler`, as a new trace chunk, then
// call `flush()`. Returns `nullopt`.
Expected<void> DatadogAgent::send(
std::vector<std::unique_ptr<SpanData>>&& spans,
const std::shared_ptr<TraceSampler>& response_handler) {
// NOTE(review): the next two statements duplicate the scoped block that
// follows; they look like the pre-change side of this diff hunk. If both
// versions were compiled, `mutex_` would be locked twice on the same thread
// and `spans` would be pushed a second time after being moved from.
std::lock_guard<std::mutex> lock(mutex_);
trace_chunks_.push_back(TraceChunk{std::move(spans), response_handler});
{
// The lock is confined to this scope so that it is released before
// `flush()` runs.
std::lock_guard<std::mutex> lock(mutex_);
trace_chunks_.push_back(TraceChunk{std::move(spans), response_handler});
}
flush();
return nullopt;
}

Expand Down
11 changes: 11 additions & 0 deletions src/datadog/span.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <datadog/span_config.h>
#include <datadog/string_view.h>
#include <datadog/trace_segment.h>
#include <datadog/tracer.h>

#include <cassert>
#include <string>
Expand Down Expand Up @@ -40,6 +41,16 @@ Span::~Span() {
data_->duration = now - data_->start;
}

#ifdef __linux__
// When a span is finished, we must update the span_id to its parent's.
if (elastic_apm_profiling_correlation_process_storage_v1 != nullptr &&
parent_id().has_value()) {
elastic_apm_profiling_correlation_tls_v1->valid = 0;
elastic_apm_profiling_correlation_tls_v1->span_id = parent_id().value();
elastic_apm_profiling_correlation_tls_v1->valid = 1;
Comment on lines +48 to +50
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For future reference, if anyone's curious, I was reviewing this with @elsakeirouz and we noticed we're missing some synchronization here, per the spec:

Note that APM-agents must make sure that compilers do not reorder the steps listed above.

}
#endif

trace_segment_->span_finished();
}

Expand Down
9 changes: 9 additions & 0 deletions src/datadog/trace_segment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <datadog/telemetry/metrics.h>
#include <datadog/telemetry/telemetry.h>
#include <datadog/trace_segment.h>
#include <datadog/tracer.h>

#include <cassert>
#include <string>
Expand Down Expand Up @@ -137,6 +138,8 @@ Optional<SamplingDecision> TraceSegment::sampling_decision() const {
return sampling_decision_;
}

// Return the span ID of the first span stored in this segment, which this
// accessor treats as the segment's local root.
//
// Precondition: the segment holds at least one span.
// NOTE(review): `spans_` is read here without any locking — confirm that
// callers cannot race with span registration or completion.
uint64_t TraceSegment::local_root_id() const {
  // Calling `front()` on an empty container is undefined behavior; make the
  // precondition explicit in debug builds.
  assert(!spans_.empty());
  return spans_.front()->span_id;
}

Logger& TraceSegment::logger() const { return *logger_; }

void TraceSegment::register_span(std::unique_ptr<SpanData> span) {
Expand Down Expand Up @@ -255,6 +258,12 @@ void TraceSegment::span_finished() {
}

telemetry::counter::increment(metrics::tracer::trace_segments_closed);

#ifdef __linux__
// When all spans are finished, so is the current trace.
if (elastic_apm_profiling_correlation_process_storage_v1 != nullptr)
elastic_apm_profiling_correlation_tls_v1->trace_present = 0;
#endif
}

void TraceSegment::override_sampling_priority(SamplingPriority priority) {
Expand Down
Loading