Integration Exception Tracking #11732

Open · wants to merge 4 commits into main
56 changes: 56 additions & 0 deletions ddtrace/internal/logger.py
@@ -1,6 +1,7 @@
import collections
import logging
import os
import traceback
import typing
from typing import Optional # noqa:F401
from typing import cast # noqa:F401
@@ -120,6 +121,14 @@ def __init__(self, *args, **kwargs):
else:
self.rate_limit = 60

self.telemetry_log_buckets = None
if _TelemetryConfig.LOG_COLLECTION_ENABLED:
if self.name.startswith("ddtrace.contrib."):
# Collect only errors logged within the integration package
self.telemetry_log_buckets = collections.defaultdict(
lambda: DDLogger.LoggingBucket(0, 0)
) # type: DefaultDict[Tuple[str, int, str, int], DDLogger.LoggingBucket]

def handle(self, record):
# type: (logging.LogRecord) -> None
"""
@@ -142,6 +151,9 @@ def handle(self, record):
full_file_name = os.path.join(record.pathname, record.filename)
telemetry.telemetry_writer.add_error(1, record.msg % record.args, full_file_name, record.lineno)

if self.telemetry_log_buckets is not None:
self._report_telemetry_log(record)

# If rate limiting has been disabled (`DD_TRACE_LOGGING_RATE=0`) then apply no rate limit
# If the logging is in debug, then do not apply any limits to any log
if not self.rate_limit or self.getEffectiveLevel() == logging.DEBUG:
@@ -178,3 +190,47 @@ def handle(self, record):
# Increment the count of records we have skipped
# DEV: `self.buckets[key]` is a tuple which is immutable so recreate instead
self.buckets[key] = DDLogger.LoggingBucket(logging_bucket.bucket, logging_bucket.skipped + 1)

def _report_telemetry_log(self, record):
# type: (logging.LogRecord) -> None
from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL

key = (record.name, record.levelno, record.pathname, record.lineno)
current_bucket = int(record.created / _TelemetryConfig.TELEMETRY_HEARTBEAT_INTERVAL)
key_bucket = self.telemetry_log_buckets[key]
if key_bucket.bucket == current_bucket:
self.telemetry_log_buckets[key] = DDLogger.LoggingBucket(key_bucket.bucket, key_bucket.skipped + 1)
else:
self.telemetry_log_buckets[key] = DDLogger.LoggingBucket(current_bucket, 0)
level = (
TELEMETRY_LOG_LEVEL.ERROR
if record.levelno >= logging.ERROR
else TELEMETRY_LOG_LEVEL.WARNING
if record.levelno == logging.WARNING
else TELEMETRY_LOG_LEVEL.DEBUG
)
from ddtrace.internal import telemetry

tags = {
"lib_language": "python",
}
stack_trace = None
if record.exc_info:
_, _, traceback_object = record.exc_info
if traceback_object:
stack_trace = "".join(traceback.format_tb(traceback_object))
# TODO redact absolute file paths and unknown packages
if record.levelno >= logging.ERROR or stack_trace is not None:
# Report only an error or an exception with a stack trace
telemetry.telemetry_writer.add_log(
level, record.msg, tags=tags, stack_trace=stack_trace, count=key_bucket.skipped + 1
)


class _TelemetryConfig:
Contributor:
It looks like we are introducing telemetry-specific logic into a logging source. Can we try to see if there is a different design that allows keeping the two separate, please?

@ygree (Author), Jan 8, 2025:

Not really "introducing", since some of this was already there to capture errors, and this change just extends it to exception tracking.
Alternatively, we would have to duplicate all the logging calls in the contib modules just to have exception tracking, which is easy to forget to add, and just introduces code duplication in the instrumentation code.

I'll consider adding a separate telemetry logger if you think that's a better solution. It will probably need to be in the same package, because my attempt to put it in a telemetry package ended with

ImportError: cannot import name 'get_logger' from partially initialized module 'ddtrace.internal.logger' (most likely due to circular import)
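Note that the method added in this diff already defers its telemetry imports to call time (the from ddtrace.internal import telemetry inside _report_telemetry_log), which is the usual way such an import-time cycle is avoided. A minimal two-module sketch of that deferred-import pattern, with illustrative module names rather than the actual ddtrace layout:

# telemetry_mod.py (illustrative) -- imports the logger module at import time
import logger_mod

def send(msg):
    print("telemetry:", msg)

# logger_mod.py (illustrative) -- importing telemetry_mod at module level
# would close the cycle and fail, so the import is deferred to call time.
def report(msg):
    from telemetry_mod import send  # resolved only when report() runs
    send(msg)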

TELEMETRY_ENABLED = os.getenv("DD_INSTRUMENTATION_TELEMETRY_ENABLED", "true").lower() in ("true", "1")
LOG_COLLECTION_ENABLED = TELEMETRY_ENABLED and os.getenv("DD_TELEMETRY_LOG_COLLECTION_ENABLED", "true").lower() in (
"true",
"1",
)
TELEMETRY_HEARTBEAT_INTERVAL = float(os.getenv("DD_TELEMETRY_HEARTBEAT_INTERVAL", "60"))
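To make the new bucketing easier to follow, here is a simplified standalone sketch of the idea (not a line-for-line copy of _report_telemetry_log): records sharing a (logger name, level, path, line) key are counted within a fixed time window, so a telemetry log event can carry a repeat count instead of one event per occurrence. The 60-second interval mirrors the DD_TELEMETRY_HEARTBEAT_INTERVAL default; everything else is illustrative.

import collections
import time

# Standalone sketch of time-bucketed repeat counting for log records.
# A (name, level, path, line) tuple identifies a log site; repeats from the
# same site within one interval are counted rather than reported one by one.
LoggingBucket = collections.namedtuple("LoggingBucket", ("bucket", "skipped"))
INTERVAL = 60.0  # mirrors the DD_TELEMETRY_HEARTBEAT_INTERVAL default

buckets = collections.defaultdict(lambda: LoggingBucket(0, 0))


def observe(key, created=None):
    """Record one occurrence for this log site and return its count in the current bucket."""
    created = time.time() if created is None else created
    current = int(created / INTERVAL)
    previous = buckets[key]
    if previous.bucket == current:
        buckets[key] = LoggingBucket(current, previous.skipped + 1)
    else:
        buckets[key] = LoggingBucket(current, 0)
    return buckets[key].skipped + 1


# Three identical errors in the same window yield counts 1, 2, 3.
site = ("ddtrace.contrib.example", 40, "/path/to/file.py", 10)
print([observe(site) for _ in range(3)])  # [1, 2, 3]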
6 changes: 4 additions & 2 deletions ddtrace/internal/telemetry/writer.py
@@ -478,8 +478,8 @@ def add_configurations(self, configuration_list):
"value": value,
}

def add_log(self, level, message, stack_trace="", tags=None):
# type: (TELEMETRY_LOG_LEVEL, str, str, Optional[Dict]) -> None
def add_log(self, level, message, stack_trace="", tags=None, count=1):
# type: (TELEMETRY_LOG_LEVEL, str, str, Optional[Dict], int) -> None
"""
Queues log. This event is meant to send library logs to Datadog’s backend through the Telemetry intake.
This will make support cycles easier and ensure we know about potentially silent issues in libraries.
@@ -499,6 +499,8 @@ def add_log(self, level, message, stack_trace="", tags=None):
data["tags"] = ",".join(["%s:%s" % (k, str(v).lower()) for k, v in tags.items()])
if stack_trace:
data["stack_trace"] = stack_trace
if count > 1:
data["count"] = count
self._logs.add(data)

def add_gauge_metric(self, namespace, name, value, tags=None):
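A small sketch of what the count extension does to the queued payload, assuming a hypothetical build_log_payload helper that mirrors only the add_log lines shown in this hunk (the writer's other payload fields are elided): count is attached only when it exceeds 1, so the common single-occurrence case is unchanged.

# Hypothetical helper mirroring only the add_log lines shown above;
# the real writer builds a larger payload before queueing it.
def build_log_payload(level, message, stack_trace="", tags=None, count=1):
    data = {"message": message, "level": level}
    if tags:
        data["tags"] = ",".join("%s:%s" % (k, str(v).lower()) for k, v in tags.items())
    if stack_trace:
        data["stack_trace"] = stack_trace
    if count > 1:
        data["count"] = count  # only repeated occurrences carry a count
    return data


print(build_log_payload("ERROR", "integration failed", tags={"lib_language": "python"}, count=3))
# {'message': 'integration failed', 'level': 'ERROR', 'tags': 'lib_language:python', 'count': 3}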