diff --git a/charmcraft.yaml b/charmcraft.yaml index 481f8d1..ecdd7fa 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -32,6 +32,10 @@ parts: - pkg-config - rustc - cargo + charm-binary-python-packages: + # Install PyYAML from binary and avoid building it from sources. This way, we can use PyYAML with C-optimized lib. + # With the C-optimized lib, serialization in ops is 20x faster. + - PyYAML cos-tool: plugin: dump source: . diff --git a/lib/charms/observability_libs/v1/cert_handler.py b/lib/charms/observability_libs/v1/cert_handler.py index 4a1940b..26be879 100644 --- a/lib/charms/observability_libs/v1/cert_handler.py +++ b/lib/charms/observability_libs/v1/cert_handler.py @@ -32,6 +32,7 @@ Since this library uses [Juju Secrets](https://juju.is/docs/juju/secret) it requires Juju >= 3.0.3. """ import abc +import hashlib import ipaddress import json import socket @@ -67,7 +68,7 @@ LIBID = "b5cd5cd580f3428fa5f59a8876dcbe6a" LIBAPI = 1 -LIBPATCH = 13 +LIBPATCH = 14 VAULT_SECRET_LABEL = "cert-handler-private-vault" @@ -301,14 +302,11 @@ def __init__( Must match metadata.yaml. cert_subject: Custom subject. Name collisions are under the caller's responsibility. sans: DNS names. If none are given, use FQDN. - refresh_events: an optional list of bound events which - will be observed to replace the current CSR with a new one - if there are changes in the CSR's DNS SANs or IP SANs. - Then, subsequently, replace its corresponding certificate with a new one. + refresh_events: [DEPRECATED]. """ super().__init__(charm, key) # use StoredState to store the hash of the CSR - # to potentially trigger a CSR renewal on `refresh_events` + # to potentially trigger a CSR renewal self._stored.set_default( csr_hash=None, ) @@ -320,8 +318,9 @@ def __init__( # Use fqdn only if no SANs were given, and drop empty/duplicate SANs sans = list(set(filter(None, (sans or [socket.getfqdn()])))) - self.sans_ip = list(filter(is_ip_address, sans)) - self.sans_dns = list(filterfalse(is_ip_address, sans)) + # sort SANS lists to avoid unnecessary csr renewals during reconciliation + self.sans_ip = sorted(filter(is_ip_address, sans)) + self.sans_dns = sorted(filterfalse(is_ip_address, sans)) if self._check_juju_supports_secrets(): vault_backend = _SecretVaultBackend(charm, secret_label=VAULT_SECRET_LABEL) @@ -367,13 +366,15 @@ def __init__( ) if refresh_events: - for ev in refresh_events: - self.framework.observe(ev, self._on_refresh_event) + logger.warn( + "DEPRECATION WARNING. `refresh_events` is now deprecated. CertHandler will automatically refresh the CSR when necessary." + ) - def _on_refresh_event(self, _): - """Replace the latest current CSR with a new one if there are any SANs changes.""" - if self._stored.csr_hash != self._csr_hash: - self._generate_csr(renew=True) + self._reconcile() + + def _reconcile(self): + """Run all logic that is independent of what event we're processing.""" + self._refresh_csr_if_needed() def _on_upgrade_charm(self, _): has_privkey = self.vault.get_value("private-key") @@ -388,6 +389,11 @@ def _on_upgrade_charm(self, _): # this will call `self.private_key` which will generate a new privkey. self._generate_csr(renew=True) + def _refresh_csr_if_needed(self): + """Refresh the current CSR with a new one if there are any SANs changes.""" + if self._stored.csr_hash is not None and self._stored.csr_hash != self._csr_hash: + self._generate_csr(renew=True) + def _migrate_vault(self): peer_backend = _RelationVaultBackend(self.charm, relation_name="peers") @@ -440,13 +446,17 @@ def enabled(self) -> bool: return True @property - def _csr_hash(self) -> int: + def _csr_hash(self) -> str: """A hash of the config that constructs the CSR. Only include here the config options that, should they change, should trigger a renewal of the CSR. """ - return hash( + + def _stable_hash(data): + return hashlib.sha256(str(data).encode()).hexdigest() + + return _stable_hash( ( tuple(self.sans_dns), tuple(self.sans_ip), diff --git a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py index 1e7ff84..cf8def1 100644 --- a/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py +++ b/lib/charms/tempo_coordinator_k8s/v0/charm_tracing.py @@ -69,6 +69,9 @@ def my_tracing_endpoint(self) -> Optional[str]: - every event as a span (including custom events) - every charm method call (except dunders) as a span +We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests +go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down. + ## TLS support If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm), @@ -114,6 +117,57 @@ def get_tracer(self) -> opentelemetry.trace.Tracer: See the official opentelemetry Python SDK documentation for usage: https://opentelemetry-python.readthedocs.io/en/latest/ + +## Caching traces +The `trace_charm` machinery will buffer any traces collected during charm execution and store them +to a file on the charm container until a tracing backend becomes available. At that point, it will +flush them to the tracing receiver. + +By default, the buffer is configured to start dropping old traces if any of these conditions apply: + +- the storage size exceeds 10 MiB +- the number of buffered events exceeds 100 + +You can configure this by, for example: + +```python +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert", + # only cache up to 42 events + buffer_max_events=42, + # only cache up to 42 MiB + buffer_max_size_mib=42, # minimum 10! +) +class MyCharm(CharmBase): + ... +``` + +Note that setting `buffer_max_events` to 0 will effectively disable the buffer. + +The path of the buffer file is by default in the charm's execution root, which for k8s charms means +that in case of pod churn, the cache will be lost. The recommended solution is to use an existing storage +(or add a new one) such as: + +```yaml +storage: + data: + type: filesystem + location: /charm-traces +``` + +and then configure the `@trace_charm` decorator to use it as path for storing the buffer: +```python +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert", + # store traces to a PVC so they're not lost on pod restart. + buffer_path="/charm-traces/buffer.file", +) +class MyCharm(CharmBase): + ... +``` + ## Upgrading from `v0` If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already @@ -171,6 +225,12 @@ def my_tracing_endpoint(self) -> Optional[str]: 3) If you were passing a certificate (str) using `server_cert`, you need to change it to provide an *absolute* path to the certificate file instead. """ +import typing + +from opentelemetry.exporter.otlp.proto.common._internal.trace_encoder import ( + encode_spans, +) +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter def _remove_stale_otel_sdk_packages(): @@ -222,6 +282,9 @@ def _remove_stale_otel_sdk_packages(): otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ") +# apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. +# it could be trouble if someone ever decides to implement their own tracer parallel to +# ours and before the charm has inited. We assume they won't. _remove_stale_otel_sdk_packages() import functools @@ -235,6 +298,7 @@ def _remove_stale_otel_sdk_packages(): Any, Callable, Generator, + List, Optional, Sequence, Type, @@ -247,8 +311,12 @@ def _remove_stale_otel_sdk_packages(): import ops from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import Span, TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace import ReadableSpan, Span, TracerProvider +from opentelemetry.sdk.trace.export import ( + BatchSpanProcessor, + SpanExporter, + SpanExportResult, +) from opentelemetry.trace import INVALID_SPAN, Tracer from opentelemetry.trace import get_current_span as otlp_get_current_span from opentelemetry.trace import ( @@ -269,7 +337,7 @@ def _remove_stale_otel_sdk_packages(): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 2 +LIBPATCH = 4 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] @@ -277,7 +345,7 @@ def _remove_stale_otel_sdk_packages(): dev_logger = logging.getLogger("tracing-dev") # set this to 0 if you are debugging/developing this library source -dev_logger.setLevel(logging.CRITICAL) +dev_logger.setLevel(logging.ERROR) _CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof _C = TypeVar("_C", bound=_CharmType) @@ -287,6 +355,186 @@ def _remove_stale_otel_sdk_packages(): _GetterType = Union[Callable[[_CharmType], Optional[str]], property] CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" +BUFFER_DEFAULT_CACHE_FILE_NAME = ".charm_tracing_buffer.raw" +# we store the buffer as raw otlp-native protobuf (bytes) since it's hard to serialize/deserialize it in +# any portable format. Json dumping is supported, but loading isn't. +# cfr: https://github.com/open-telemetry/opentelemetry-python/issues/1003 + +BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB = 10 +_BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN = 10 +BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH = 100 +_MiB_TO_B = 2**20 # megabyte to byte conversion rate +_OTLP_SPAN_EXPORTER_TIMEOUT = 1 +"""Timeout in seconds that the OTLP span exporter has to push traces to the backend.""" + + +class _Buffer: + """Handles buffering for spans emitted while no tracing backend is configured or available. + + Use the max_event_history_length_buffering param of @trace_charm to tune + the amount of memory that this will hog on your units. + + The buffer is formatted as a bespoke byte dump (protobuf limitation). + We cannot store them as json because that is not well-supported by the sdk + (see https://github.com/open-telemetry/opentelemetry-python/issues/3364). + """ + + _SPANSEP = b"__CHARM_TRACING_BUFFER_SPAN_SEP__" + + def __init__(self, db_file: Path, max_event_history_length: int, max_buffer_size_mib: int): + self._db_file = db_file + self._max_event_history_length = max_event_history_length + self._max_buffer_size_mib = max(max_buffer_size_mib, _BUFFER_CACHE_FILE_SIZE_LIMIT_MiB_MIN) + + # set by caller + self.exporter: Optional[OTLPSpanExporter] = None + + def save(self, spans: typing.Sequence[ReadableSpan]): + """Save the spans collected by this exporter to the cache file. + + This method should be as fail-safe as possible. + """ + if self._max_event_history_length < 1: + dev_logger.debug("buffer disabled: max history length < 1") + return + + current_history_length = len(self.load()) + new_history_length = current_history_length + len(spans) + if (diff := self._max_event_history_length - new_history_length) < 0: + self.drop(diff) + self._save(spans) + + def _serialize(self, spans: Sequence[ReadableSpan]) -> bytes: + # encode because otherwise we can't json-dump them + return encode_spans(spans).SerializeToString() + + def _save(self, spans: Sequence[ReadableSpan], replace: bool = False): + dev_logger.debug(f"saving {len(spans)} new spans to buffer") + old = [] if replace else self.load() + new = self._serialize(spans) + + try: + # if the buffer exceeds the size limit, we start dropping old spans until it does + + while len((new + self._SPANSEP.join(old))) > (self._max_buffer_size_mib * _MiB_TO_B): + if not old: + # if we've already dropped all spans and still we can't get under the + # size limit, we can't save this span + logger.error( + f"span exceeds total buffer size limit ({self._max_buffer_size_mib}MiB); " + f"buffering FAILED" + ) + return + + old = old[1:] + logger.warning( + f"buffer size exceeds {self._max_buffer_size_mib}MiB; dropping older spans... " + f"Please increase the buffer size, disable buffering, or ensure the spans can be flushed." + ) + + self._db_file.write_bytes(new + self._SPANSEP.join(old)) + except Exception: + logger.exception("error buffering spans") + + def load(self) -> List[bytes]: + """Load currently buffered spans from the cache file. + + This method should be as fail-safe as possible. + """ + if not self._db_file.exists(): + dev_logger.debug("buffer file not found. buffer empty.") + return [] + try: + spans = self._db_file.read_bytes().split(self._SPANSEP) + except Exception: + logger.exception(f"error parsing {self._db_file}") + return [] + return spans + + def drop(self, n_spans: Optional[int] = None): + """Drop some currently buffered spans from the cache file.""" + current = self.load() + if n_spans: + dev_logger.debug(f"dropping {n_spans} spans from buffer") + new = current[n_spans:] + else: + dev_logger.debug("emptying buffer") + new = [] + + self._db_file.write_bytes(self._SPANSEP.join(new)) + + def flush(self) -> Optional[bool]: + """Export all buffered spans to the given exporter, then clear the buffer. + + Returns whether the flush was successful, and None if there was nothing to flush. + """ + if not self.exporter: + dev_logger.debug("no exporter set; skipping buffer flush") + return False + + buffered_spans = self.load() + if not buffered_spans: + dev_logger.debug("nothing to flush; buffer empty") + return None + + errors = False + for span in buffered_spans: + try: + out = self.exporter._export(span) # type: ignore + if not (200 <= out.status_code < 300): + # take any 2xx status code as a success + errors = True + except ConnectionError: + dev_logger.debug( + "failed exporting buffered span; backend might be down or still starting" + ) + errors = True + except Exception: + logger.exception("unexpected error while flushing span batch from buffer") + errors = True + + if not errors: + self.drop() + else: + logger.error("failed flushing spans; buffer preserved") + return not errors + + @property + def is_empty(self): + """Utility to check whether the buffer has any stored spans. + + This is more efficient than attempting a load() given how large the buffer might be. + """ + return (not self._db_file.exists()) or (self._db_file.stat().st_size == 0) + + +class _OTLPSpanExporter(OTLPSpanExporter): + """Subclass of OTLPSpanExporter to configure the max retry timeout, so that it fails a bit faster.""" + + # The issue we're trying to solve is that the model takes AGES to settle if e.g. tls is misconfigured, + # as every hook of a charm_tracing-instrumented charm takes about a minute to exit, as the charm can't + # flush the traces and keeps retrying for 'too long' + + _MAX_RETRY_TIMEOUT = 4 + # we give the exporter 4 seconds in total to succeed pushing the traces to tempo + # if it fails, we'll be caching the data in the buffer and flush it the next time, so there's no data loss risk. + # this means 2/3 retries (hard to guess from the implementation) and up to ~7 seconds total wait + + +class _BufferedExporter(InMemorySpanExporter): + def __init__(self, buffer: _Buffer) -> None: + super().__init__() + self._buffer = buffer + + def export(self, spans: typing.Sequence[ReadableSpan]) -> SpanExportResult: + self._buffer.save(spans) + return super().export(spans) + + def force_flush(self, timeout_millis: int = 0) -> bool: + # parent implementation is fake, so the timeout_millis arg is not doing anything. + result = super().force_flush(timeout_millis) + self._buffer.save(self.get_finished_spans()) + return result def is_enabled() -> bool: @@ -423,7 +671,10 @@ def _setup_root_span_initializer( charm_type: _CharmType, tracing_endpoint_attr: str, server_cert_attr: Optional[str], - service_name: Optional[str] = None, + service_name: Optional[str], + buffer_path: Optional[Path], + buffer_max_events: int, + buffer_max_size_mib: int, ): """Patch the charm's initializer.""" original_init = charm_type.__init__ @@ -442,18 +693,11 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): logger.info("Tracing DISABLED: skipping root span initialization") return - # already init some attrs that will be reinited later by calling original_init: - # self.framework = framework - # self.handle = Handle(None, self.handle_kind, None) - original_event_context = framework._event_context # default service name isn't just app name because it could conflict with the workload service name _service_name = service_name or f"{self.app.name}-charm" unit_name = self.unit.name - # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. - # it could be trouble if someone ever decides to implement their own tracer parallel to - # ours and before the charm has inited. We assume they won't. resource = Resource.create( attributes={ "service.name": _service_name, @@ -471,33 +715,60 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) + buffer_only = False + # whether we're only exporting to buffer, or also to the otlp exporter. + if not tracing_endpoint: # tracing is off if tracing_endpoint is None - return + # however we can buffer things until tracing comes online + buffer_only = True server_cert: Optional[Union[str, Path]] = ( _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None ) - if tracing_endpoint.startswith("https://") and not server_cert: + if (tracing_endpoint and tracing_endpoint.startswith("https://")) and not server_cert: logger.error( "Tracing endpoint is https, but no server_cert has been passed." "Please point @trace_charm to a `server_cert` attr. " "This might also mean that the tracing provider is related to a " "certificates provider, but this application is not (yet). " "In that case, you might just have to wait a bit for the certificates " - "integration to settle. " + "integration to settle. This span will be buffered." ) - return + buffer_only = True - exporter = OTLPSpanExporter( - endpoint=tracing_endpoint, - certificate_file=str(Path(server_cert).absolute()) if server_cert else None, - timeout=2, + buffer = _Buffer( + db_file=buffer_path or Path() / BUFFER_DEFAULT_CACHE_FILE_NAME, + max_event_history_length=buffer_max_events, + max_buffer_size_mib=buffer_max_size_mib, ) + previous_spans_buffered = not buffer.is_empty + + exporters: List[SpanExporter] = [] + if buffer_only: + # we have to buffer because we're missing necessary backend configuration + dev_logger.debug("buffering mode: ON") + exporters.append(_BufferedExporter(buffer)) + + else: + dev_logger.debug("buffering mode: FALLBACK") + # in principle, we have the right configuration to be pushing traces, + # but if we fail for whatever reason, we will put everything in the buffer + # and retry the next time + otlp_exporter = _OTLPSpanExporter( + endpoint=tracing_endpoint, + certificate_file=str(Path(server_cert).absolute()) if server_cert else None, + timeout=_OTLP_SPAN_EXPORTER_TIMEOUT, # give individual requests 1 second to succeed + ) + exporters.append(otlp_exporter) + exporters.append(_BufferedExporter(buffer)) + buffer.exporter = otlp_exporter + + for exporter in exporters: + processor = BatchSpanProcessor(exporter) + provider.add_span_processor(processor) - processor = BatchSpanProcessor(exporter) - provider.add_span_processor(processor) set_tracer_provider(provider) _tracer = get_tracer(_service_name) # type: ignore _tracer_token = tracer.set(_tracer) @@ -521,7 +792,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): @contextmanager def wrap_event_context(event_name: str): - dev_logger.info(f"entering event context: {event_name}") + dev_logger.debug(f"entering event context: {event_name}") # when the framework enters an event context, we create a span. with _span("event: " + event_name) as event_context_span: if event_context_span: @@ -535,12 +806,50 @@ def wrap_event_context(event_name: str): @functools.wraps(original_close) def wrap_close(): - dev_logger.info("tearing down tracer and flushing traces") + dev_logger.debug("tearing down tracer and flushing traces") span.end() opentelemetry.context.detach(span_token) # type: ignore tracer.reset(_tracer_token) tp = cast(TracerProvider, get_tracer_provider()) - tp.force_flush(timeout_millis=1000) # don't block for too long + flush_successful = tp.force_flush(timeout_millis=1000) # don't block for too long + + if buffer_only: + # if we're in buffer_only mode, it means we couldn't even set up the exporter for + # tempo as we're missing some data. + # so attempting to flush the buffer doesn't make sense + dev_logger.debug("tracing backend unavailable: all spans pushed to buffer") + + else: + dev_logger.debug("tracing backend found: attempting to flush buffer...") + + # if we do have an exporter for tempo, and we could send traces to it, + # we can attempt to flush the buffer as well. + if not flush_successful: + logger.error("flushing FAILED: unable to push traces to backend.") + else: + dev_logger.debug("flush succeeded.") + + # the backend has accepted the spans generated during this event, + if not previous_spans_buffered: + # if the buffer was empty to begin with, any spans we collected now can be discarded + buffer.drop() + dev_logger.debug("buffer dropped: this trace has been sent already") + else: + # if the buffer was nonempty, we can attempt to flush it + dev_logger.debug("attempting buffer flush...") + buffer_flush_successful = buffer.flush() + if buffer_flush_successful: + dev_logger.debug("buffer flush OK") + elif buffer_flush_successful is None: + # TODO is this even possible? + dev_logger.debug("buffer flush OK; empty: nothing to flush") + else: + # this situation is pretty weird, I'm not even sure it can happen, + # because it would mean that we did manage + # to push traces directly to the tempo exporter (flush_successful), + # but the buffer flush failed to push to the same exporter! + logger.error("buffer flush FAILED") + tp.shutdown() original_close() @@ -555,6 +864,9 @@ def trace_charm( server_cert: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), + buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, + buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, + buffer_path: Optional[Union[str, Path]] = None, ) -> Callable[[_T], _T]: """Autoinstrument the decorated charm with tracing telemetry. @@ -596,6 +908,10 @@ def trace_charm( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... + :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. + :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. + Minimum 10MiB. + :param buffer_path: path to buffer file to use for saving buffered spans. """ def _decorator(charm_type: _T) -> _T: @@ -606,6 +922,9 @@ def _decorator(charm_type: _T) -> _T: server_cert_attr=server_cert, service_name=service_name, extra_types=extra_types, + buffer_path=Path(buffer_path) if buffer_path else None, + buffer_max_size_mib=buffer_max_size_mib, + buffer_max_events=buffer_max_events, ) return charm_type @@ -618,6 +937,9 @@ def _autoinstrument( server_cert_attr: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), + buffer_max_events: int = BUFFER_DEFAULT_MAX_EVENT_HISTORY_LENGTH, + buffer_max_size_mib: int = BUFFER_DEFAULT_CACHE_FILE_SIZE_LIMIT_MiB, + buffer_path: Optional[Path] = None, ) -> _T: """Set up tracing on this charm class. @@ -650,13 +972,20 @@ def _autoinstrument( Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... + :param buffer_max_events: max number of events to save in the buffer. Set to 0 to disable buffering. + :param buffer_max_size_mib: max size of the buffer file. When exceeded, spans will be dropped. + Minimum 10MiB. + :param buffer_path: path to buffer file to use for saving buffered spans. """ - dev_logger.info(f"instrumenting {charm_type}") + dev_logger.debug(f"instrumenting {charm_type}") _setup_root_span_initializer( charm_type, tracing_endpoint_attr, server_cert_attr=server_cert_attr, service_name=service_name, + buffer_path=buffer_path, + buffer_max_events=buffer_max_events, + buffer_max_size_mib=buffer_max_size_mib, ) trace_type(charm_type) for type_ in extra_types: @@ -672,12 +1001,12 @@ def trace_type(cls: _T) -> _T: It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` has been instantiated. """ - dev_logger.info(f"instrumenting {cls}") + dev_logger.debug(f"instrumenting {cls}") for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): - dev_logger.info(f"discovered {method}") + dev_logger.debug(f"discovered {method}") if method.__name__.startswith("__"): - dev_logger.info(f"skipping {method} (dunder)") + dev_logger.debug(f"skipping {method} (dunder)") continue # the span title in the general case should be: @@ -723,7 +1052,7 @@ def trace_function(function: _F, name: Optional[str] = None) -> _F: def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: - dev_logger.info(f"instrumenting {callable}") + dev_logger.debug(f"instrumenting {callable}") # sig = inspect.signature(callable) @functools.wraps(callable) diff --git a/lib/charms/tempo_coordinator_k8s/v0/tracing.py b/lib/charms/tempo_coordinator_k8s/v0/tracing.py index 1f92867..2035dff 100644 --- a/lib/charms/tempo_coordinator_k8s/v0/tracing.py +++ b/lib/charms/tempo_coordinator_k8s/v0/tracing.py @@ -34,7 +34,7 @@ def __init__(self, *args): `TracingEndpointRequirer.request_protocols(*protocol:str, relation:Optional[Relation])` method. Using this method also allows you to use per-relation protocols. -Units of provider charms obtain the tempo endpoint to which they will push their traces by calling +Units of requirer charms obtain the tempo endpoint to which they will push their traces by calling `TracingEndpointRequirer.get_endpoint(protocol: str)`, where `protocol` is, for example: - `otlp_grpc` - `otlp_http` @@ -44,7 +44,10 @@ def __init__(self, *args): If the `protocol` is not in the list of protocols that the charm requested at endpoint set-up time, the library will raise an error. -## Requirer Library Usage +We recommend that you scale up your tracing provider and relate it to an ingress so that your tracing requests +go through the ingress and get load balanced across all units. Otherwise, if the provider's leader goes down, your tracing goes down. + +## Provider Library Usage The `TracingEndpointProvider` object may be used by charms to manage relations with their trace sources. For this purposes a Tempo-like charm needs to do two things @@ -107,7 +110,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 2 +LIBPATCH = 3 PYDEPS = ["pydantic"] diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py deleted file mode 100644 index 2dbdddd..0000000 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ /dev/null @@ -1,760 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Canonical Ltd. -# See LICENSE file for licensing details. - -"""This charm library contains utilities to instrument your Charm with opentelemetry tracing data collection. - -(yes! charm code, not workload code!) - -This means that, if your charm is related to, for example, COS' Tempo charm, you will be able to inspect -in real time from the Grafana dashboard the execution flow of your charm. - -# Quickstart -Fetch the following charm libs (and ensure the minimum version/revision numbers are satisfied): - - charmcraft fetch-lib charms.tempo_k8s.v2.tracing # >= 1.10 - charmcraft fetch-lib charms.tempo_k8s.v1.charm_tracing # >= 2.7 - -Then edit your charm code to include: - -```python -# import the necessary charm libs -from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer, charm_tracing_config -from charms.tempo_k8s.v1.charm_tracing import charm_tracing - -# decorate your charm class with charm_tracing: -@charm_tracing( - # forward-declare the instance attributes that the instrumentor will look up to obtain the - # tempo endpoint and server certificate - tracing_endpoint="tracing_endpoint", - server_cert="server_cert" -) -class MyCharm(CharmBase): - _path_to_cert = "/path/to/cert.crt" - # path to cert file **in the charm container**. Its presence will be used to determine whether - # the charm is ready to use tls for encrypting charm traces. If your charm does not support tls, - # you can ignore this and pass None to charm_tracing_config. - # If you do support TLS, you'll need to make sure that the server cert is copied to this location - # and kept up to date so the instrumentor can use it. - - def __init__(self, ...): - ... - self.tracing = TracingEndpointRequirer(self, ...) - self.tracing_endpoint, self.server_cert = charm_tracing_config(self.tracing, self._path_to_cert) -``` - -# Detailed usage -To use this library, you need to do two things: -1) decorate your charm class with - -`@trace_charm(tracing_endpoint="my_tracing_endpoint")` - -2) add to your charm a "my_tracing_endpoint" (you can name this attribute whatever you like) -**property**, **method** or **instance attribute** that returns an otlp http/https endpoint url. -If you are using the ``charms.tempo_k8s.v2.tracing.TracingEndpointRequirer`` as -``self.tracing = TracingEndpointRequirer(self)``, the implementation could be: - -``` - @property - def my_tracing_endpoint(self) -> Optional[str]: - '''Tempo endpoint for charm tracing''' - if self.tracing.is_ready(): - return self.tracing.get_endpoint("otlp_http") - else: - return None -``` - -At this point your charm will be automatically instrumented so that: -- charm execution starts a trace, containing - - every event as a span (including custom events) - - every charm method call (except dunders) as a span - - -## TLS support -If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm), -you can configure ``charm_tracing`` to use TLS by passing a ``server_cert`` parameter to the decorator. - -If your charm is not trusting the same CA as the Tempo endpoint it is sending traces to, -you'll need to implement a cert-transfer relation to obtain the CA certificate from the same -CA that Tempo is using. - -For example: -``` -from charms.tempo_k8s.v1.charm_tracing import trace_charm -@trace_charm( - tracing_endpoint="my_tracing_endpoint", - server_cert="_server_cert" -) -class MyCharm(CharmBase): - self._server_cert = "/path/to/server.crt" - ... - - def on_tls_changed(self, e) -> Optional[str]: - # update the server cert on the charm container for charm tracing - Path(self._server_cert).write_text(self.get_server_cert()) - - def on_tls_broken(self, e) -> Optional[str]: - # remove the server cert so charm_tracing won't try to use tls anymore - Path(self._server_cert).unlink() -``` - - -## More fine-grained manual instrumentation -if you wish to add more spans to the trace, you can do so by getting a hold of the tracer like so: -``` -import opentelemetry -... -def get_tracer(self) -> opentelemetry.trace.Tracer: - return opentelemetry.trace.get_tracer(type(self).__name__) -``` - -By default, the tracer is named after the charm type. If you wish to override that, you can pass -a different ``service_name`` argument to ``trace_charm``. - -See the official opentelemetry Python SDK documentation for usage: -https://opentelemetry-python.readthedocs.io/en/latest/ - -## Upgrading from `v0` - -If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already -have the newest version of the library in your charm): -1) If you need the dependency for your tests, add the following dependency to your charm project -(or, if your project had a dependency on `opentelemetry-exporter-otlp-proto-grpc` only because -of `charm_tracing` v0, you can replace it with): - -`opentelemetry-exporter-otlp-proto-http>=1.21.0`. - -2) Update the charm method referenced to from ``@trace`` and ``@trace_charm``, -to return from ``TracingEndpointRequirer.get_endpoint("otlp_http")`` instead of ``grpc_http``. -For example: - -``` - from charms.tempo_k8s.v0.charm_tracing import trace_charm - - @trace_charm( - tracing_endpoint="my_tracing_endpoint", - ) - class MyCharm(CharmBase): - - ... - - @property - def my_tracing_endpoint(self) -> Optional[str]: - '''Tempo endpoint for charm tracing''' - if self.tracing.is_ready(): - return self.tracing.otlp_grpc_endpoint() # OLD API, DEPRECATED. - else: - return None -``` - -needs to be replaced with: - -``` - from charms.tempo_k8s.v1.charm_tracing import trace_charm - - @trace_charm( - tracing_endpoint="my_tracing_endpoint", - ) - class MyCharm(CharmBase): - - ... - - @property - def my_tracing_endpoint(self) -> Optional[str]: - '''Tempo endpoint for charm tracing''' - if self.tracing.is_ready(): - return self.tracing.get_endpoint("otlp_http") # NEW API, use this. - else: - return None -``` - -3) If you were passing a certificate (str) using `server_cert`, you need to change it to -provide an *absolute* path to the certificate file instead. -""" - - -def _remove_stale_otel_sdk_packages(): - """Hack to remove stale opentelemetry sdk packages from the charm's python venv. - - See https://github.com/canonical/grafana-agent-operator/issues/146 and - https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after - this juju issue is resolved and sufficient time has passed to expect most users of this library - have migrated to the patched version of juju. When this patch is removed, un-ignore rule E402 for this file in the pyproject.toml (see setting - [tool.ruff.lint.per-file-ignores] in pyproject.toml). - - This only has an effect if executed on an upgrade-charm event. - """ - # all imports are local to keep this function standalone, side-effect-free, and easy to revert later - import os - - if os.getenv("JUJU_DISPATCH_PATH") != "hooks/upgrade-charm": - return - - import logging - import shutil - from collections import defaultdict - - from importlib_metadata import distributions - - otel_logger = logging.getLogger("charm_tracing_otel_patcher") - otel_logger.debug("Applying _remove_stale_otel_sdk_packages patch on charm upgrade") - # group by name all distributions starting with "opentelemetry_" - otel_distributions = defaultdict(list) - for distribution in distributions(): - name = distribution._normalized_name # type: ignore - if name.startswith("opentelemetry_"): - otel_distributions[name].append(distribution) - - otel_logger.debug(f"Found {len(otel_distributions)} opentelemetry distributions") - - # If we have multiple distributions with the same name, remove any that have 0 associated files - for name, distributions_ in otel_distributions.items(): - if len(distributions_) <= 1: - continue - - otel_logger.debug(f"Package {name} has multiple ({len(distributions_)}) distributions.") - for distribution in distributions_: - if not distribution.files: # Not None or empty list - path = distribution._path # type: ignore - otel_logger.info(f"Removing empty distribution of {name} at {path}.") - shutil.rmtree(path) - - otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ") - - -_remove_stale_otel_sdk_packages() - -import functools -import inspect -import logging -import os -from contextlib import contextmanager -from contextvars import Context, ContextVar, copy_context -from pathlib import Path -from typing import ( - Any, - Callable, - Generator, - Optional, - Sequence, - Type, - TypeVar, - Union, - cast, -) - -import opentelemetry -import ops -from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import Span, TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.trace import ( - INVALID_SPAN, - Tracer, - get_tracer, - get_tracer_provider, - set_span_in_context, - set_tracer_provider, -) -from opentelemetry.trace import get_current_span as otlp_get_current_span -from ops.charm import CharmBase -from ops.framework import Framework - -# The unique Charmhub library identifier, never change it -LIBID = "cb1705dcd1a14ca09b2e60187d1215c7" - -# Increment this major API version when introducing breaking changes -LIBAPI = 1 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version - -LIBPATCH = 15 - -PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] - -logger = logging.getLogger("tracing") -dev_logger = logging.getLogger("tracing-dev") - -# set this to 0 if you are debugging/developing this library source -dev_logger.setLevel(logging.CRITICAL) - -_CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof -_C = TypeVar("_C", bound=_CharmType) -_T = TypeVar("_T", bound=type) -_F = TypeVar("_F", bound=Type[Callable]) -tracer: ContextVar[Tracer] = ContextVar("tracer") -_GetterType = Union[Callable[[_CharmType], Optional[str]], property] - -CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" - - -def is_enabled() -> bool: - """Whether charm tracing is enabled.""" - return os.getenv(CHARM_TRACING_ENABLED, "1") == "1" - - -@contextmanager -def charm_tracing_disabled(): - """Contextmanager to temporarily disable charm tracing. - - For usage in tests. - """ - previous = os.getenv(CHARM_TRACING_ENABLED, "1") - os.environ[CHARM_TRACING_ENABLED] = "0" - yield - os.environ[CHARM_TRACING_ENABLED] = previous - - -def get_current_span() -> Union[Span, None]: - """Return the currently active Span, if there is one, else None. - - If you'd rather keep your logic unconditional, you can use opentelemetry.trace.get_current_span, - which will return an object that behaves like a span but records no data. - """ - span = otlp_get_current_span() - if span is INVALID_SPAN: - return None - return cast(Span, span) - - -def _get_tracer_from_context(ctx: Context) -> Optional[ContextVar]: - tracers = [v for v in ctx if v is not None and v.name == "tracer"] - if tracers: - return tracers[0] - return None - - -def _get_tracer() -> Optional[Tracer]: - """Find tracer in context variable and as a fallback locate it in the full context.""" - try: - return tracer.get() - except LookupError: - # fallback: this course-corrects for a user error where charm_tracing symbols are imported - # from different paths (typically charms.tempo_k8s... and lib.charms.tempo_k8s...) - try: - ctx: Context = copy_context() - if context_tracer := _get_tracer_from_context(ctx): - logger.warning( - "Tracer not found in `tracer` context var. " - "Verify that you're importing all `charm_tracing` symbols from the same module path. \n" - "For example, DO" - ": `from charms.lib...charm_tracing import foo, bar`. \n" - "DONT: \n" - " \t - `from charms.lib...charm_tracing import foo` \n" - " \t - `from lib...charm_tracing import bar` \n" - "For more info: https://python-notes.curiousefficiency.org/en/latest/python" - "_concepts/import_traps.html#the-double-import-trap" - ) - return context_tracer.get() - else: - return None - except LookupError: - return None - - -@contextmanager -def _span(name: str) -> Generator[Optional[Span], Any, Any]: - """Context to create a span if there is a tracer, otherwise do nothing.""" - if tracer := _get_tracer(): - with tracer.start_as_current_span(name) as span: - yield cast(Span, span) - else: - yield None - - -class TracingError(RuntimeError): - """Base class for errors raised by this module.""" - - -class UntraceableObjectError(TracingError): - """Raised when an object you're attempting to instrument cannot be autoinstrumented.""" - - -class TLSError(TracingError): - """Raised when the tracing endpoint is https but we don't have a cert yet.""" - - -def _get_tracing_endpoint( - tracing_endpoint_attr: str, - charm_instance: object, - charm_type: type, -): - _tracing_endpoint = getattr(charm_instance, tracing_endpoint_attr) - if callable(_tracing_endpoint): - tracing_endpoint = _tracing_endpoint() - else: - tracing_endpoint = _tracing_endpoint - - if tracing_endpoint is None: - return - - elif not isinstance(tracing_endpoint, str): - raise TypeError( - f"{charm_type.__name__}.{tracing_endpoint_attr} should resolve to a tempo endpoint (string); " - f"got {tracing_endpoint} instead." - ) - - dev_logger.debug(f"Setting up span exporter to endpoint: {tracing_endpoint}/v1/traces") - return f"{tracing_endpoint}/v1/traces" - - -def _get_server_cert( - server_cert_attr: str, - charm_instance: ops.CharmBase, - charm_type: Type[ops.CharmBase], -): - _server_cert = getattr(charm_instance, server_cert_attr) - if callable(_server_cert): - server_cert = _server_cert() - else: - server_cert = _server_cert - - if server_cert is None: - logger.warning( - f"{charm_type}.{server_cert_attr} is None; sending traces over INSECURE connection." - ) - return - elif not Path(server_cert).is_absolute(): - raise ValueError( - f"{charm_type}.{server_cert_attr} should resolve to a valid tls cert absolute path (string | Path)); " - f"got {server_cert} instead." - ) - return server_cert - - -def _setup_root_span_initializer( - charm_type: _CharmType, - tracing_endpoint_attr: str, - server_cert_attr: Optional[str], - service_name: Optional[str] = None, -): - """Patch the charm's initializer.""" - original_init = charm_type.__init__ - - @functools.wraps(original_init) - def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): - # we're using 'self' here because this is charm init code, makes sense to read what's below - # from the perspective of the charm. Self.unit.name... - - original_init(self, framework, *args, **kwargs) - # we call this from inside the init context instead of, say, _autoinstrument, because we want it to - # be checked on a per-charm-instantiation basis, not on a per-type-declaration one. - if not is_enabled(): - # this will only happen during unittesting, hopefully, so it's fine to log a - # bit more verbosely - logger.info("Tracing DISABLED: skipping root span initialization") - return - - # already init some attrs that will be reinited later by calling original_init: - # self.framework = framework - # self.handle = Handle(None, self.handle_kind, None) - - original_event_context = framework._event_context - # default service name isn't just app name because it could conflict with the workload service name - _service_name = service_name or f"{self.app.name}-charm" - - unit_name = self.unit.name - # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. - # it could be trouble if someone ever decides to implement their own tracer parallel to - # ours and before the charm has inited. We assume they won't. - resource = Resource.create( - attributes={ - "service.name": _service_name, - "compose_service": _service_name, - "charm_type": type(self).__name__, - # juju topology - "juju_unit": unit_name, - "juju_application": self.app.name, - "juju_model": self.model.name, - "juju_model_uuid": self.model.uuid, - } - ) - provider = TracerProvider(resource=resource) - - # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. - tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) - - if not tracing_endpoint: - # tracing is off if tracing_endpoint is None - return - - server_cert: Optional[Union[str, Path]] = ( - _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None - ) - - if tracing_endpoint.startswith("https://") and not server_cert: - raise TLSError( - "Tracing endpoint is https, but no server_cert has been passed." - "Please point @trace_charm to a `server_cert` attr." - ) - - exporter = OTLPSpanExporter( - endpoint=tracing_endpoint, - certificate_file=str(Path(server_cert).absolute()) if server_cert else None, - timeout=2, - ) - - processor = BatchSpanProcessor(exporter) - provider.add_span_processor(processor) - set_tracer_provider(provider) - _tracer = get_tracer(_service_name) # type: ignore - _tracer_token = tracer.set(_tracer) - - dispatch_path = os.getenv("JUJU_DISPATCH_PATH", "") # something like hooks/install - event_name = dispatch_path.split("/")[1] if "/" in dispatch_path else dispatch_path - root_span_name = f"{unit_name}: {event_name} event" - span = _tracer.start_span(root_span_name, attributes={"juju.dispatch_path": dispatch_path}) - - # all these shenanigans are to work around the fact that the opentelemetry tracing API is built - # on the assumption that spans will be used as contextmanagers. - # Since we don't (as we need to close the span on framework.commit), - # we need to manually set the root span as current. - ctx = set_span_in_context(span) - - # log a trace id, so we can pick it up from the logs (and jhack) to look it up in tempo. - root_trace_id = hex(span.get_span_context().trace_id)[2:] # strip 0x prefix - logger.debug(f"Starting root trace with id={root_trace_id!r}.") - - span_token = opentelemetry.context.attach(ctx) # type: ignore - - @contextmanager - def wrap_event_context(event_name: str): - dev_logger.info(f"entering event context: {event_name}") - # when the framework enters an event context, we create a span. - with _span("event: " + event_name) as event_context_span: - if event_context_span: - # todo: figure out how to inject event attrs in here - event_context_span.add_event(event_name) - yield original_event_context(event_name) - - framework._event_context = wrap_event_context # type: ignore - - original_close = framework.close - - @functools.wraps(original_close) - def wrap_close(): - dev_logger.info("tearing down tracer and flushing traces") - span.end() - opentelemetry.context.detach(span_token) # type: ignore - tracer.reset(_tracer_token) - tp = cast(TracerProvider, get_tracer_provider()) - tp.force_flush(timeout_millis=1000) # don't block for too long - tp.shutdown() - original_close() - - framework.close = wrap_close - return - - charm_type.__init__ = wrap_init # type: ignore - - -def trace_charm( - tracing_endpoint: str, - server_cert: Optional[str] = None, - service_name: Optional[str] = None, - extra_types: Sequence[type] = (), -) -> Callable[[_T], _T]: - """Autoinstrument the decorated charm with tracing telemetry. - - Use this function to get out-of-the-box traces for all events emitted on this charm and all - method calls on instances of this class. - - Usage: - >>> from charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from charms.tempo_k8s.v1.tracing import TracingEndpointRequirer - >>> from ops import CharmBase - >>> - >>> @trace_charm( - >>> tracing_endpoint="tempo_otlp_http_endpoint", - >>> ) - >>> class MyCharm(CharmBase): - >>> - >>> def __init__(self, framework: Framework): - >>> ... - >>> self.tracing = TracingEndpointRequirer(self) - >>> - >>> @property - >>> def tempo_otlp_http_endpoint(self) -> Optional[str]: - >>> if self.tracing.is_ready(): - >>> return self.tracing.otlp_http_endpoint() - >>> else: - >>> return None - >>> - - :param tracing_endpoint: name of a method, property or attribute on the charm type that returns an - optional (fully resolvable) tempo url to which the charm traces will be pushed. - If None, tracing will be effectively disabled. - :param server_cert: name of a method, property or attribute on the charm type that returns an - optional absolute path to a CA certificate file to be used when sending traces to a remote server. - If it returns None, an _insecure_ connection will be used. To avoid errors in transient - situations where the endpoint is already https but there is no certificate on disk yet, it - is recommended to disable tracing (by returning None from the tracing_endpoint) altogether - until the cert has been written to disk. - :param service_name: service name tag to attach to all traces generated by this charm. - Defaults to the juju application name this charm is deployed under. - :param extra_types: pass any number of types that you also wish to autoinstrument. - For example, charm libs, relation endpoint wrappers, workload abstractions, ... - """ - - def _decorator(charm_type: _T) -> _T: - """Autoinstrument the wrapped charmbase type.""" - _autoinstrument( - charm_type, - tracing_endpoint_attr=tracing_endpoint, - server_cert_attr=server_cert, - service_name=service_name, - extra_types=extra_types, - ) - return charm_type - - return _decorator - - -def _autoinstrument( - charm_type: _T, - tracing_endpoint_attr: str, - server_cert_attr: Optional[str] = None, - service_name: Optional[str] = None, - extra_types: Sequence[type] = (), -) -> _T: - """Set up tracing on this charm class. - - Use this function to get out-of-the-box traces for all events emitted on this charm and all - method calls on instances of this class. - - Usage: - - >>> from charms.tempo_k8s.v1.charm_tracing import _autoinstrument - >>> from ops.main import main - >>> _autoinstrument( - >>> MyCharm, - >>> tracing_endpoint_attr="tempo_otlp_http_endpoint", - >>> service_name="MyCharm", - >>> extra_types=(Foo, Bar) - >>> ) - >>> main(MyCharm) - - :param charm_type: the CharmBase subclass to autoinstrument. - :param tracing_endpoint_attr: name of a method, property or attribute on the charm type that returns an - optional (fully resolvable) tempo url to which the charm traces will be pushed. - If None, tracing will be effectively disabled. - :param server_cert_attr: name of a method, property or attribute on the charm type that returns an - optional absolute path to a CA certificate file to be used when sending traces to a remote server. - If it returns None, an _insecure_ connection will be used. To avoid errors in transient - situations where the endpoint is already https but there is no certificate on disk yet, it - is recommended to disable tracing (by returning None from the tracing_endpoint) altogether - until the cert has been written to disk. - :param service_name: service name tag to attach to all traces generated by this charm. - Defaults to the juju application name this charm is deployed under. - :param extra_types: pass any number of types that you also wish to autoinstrument. - For example, charm libs, relation endpoint wrappers, workload abstractions, ... - """ - dev_logger.info(f"instrumenting {charm_type}") - _setup_root_span_initializer( - charm_type, - tracing_endpoint_attr, - server_cert_attr=server_cert_attr, - service_name=service_name, - ) - trace_type(charm_type) - for type_ in extra_types: - trace_type(type_) - - return charm_type - - -def trace_type(cls: _T) -> _T: - """Set up tracing on this class. - - Use this decorator to get out-of-the-box traces for all method calls on instances of this class. - It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` - has been instantiated. - """ - dev_logger.info(f"instrumenting {cls}") - for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): - dev_logger.info(f"discovered {method}") - - if method.__name__.startswith("__"): - dev_logger.info(f"skipping {method} (dunder)") - continue - - # the span title in the general case should be: - # method call: MyCharmWrappedMethods.b - # if the method has a name (functools.wrapped or regular method), let - # _trace_callable use its default algorithm to determine what name to give the span. - trace_method_name = None - try: - qualname_c0 = method.__qualname__.split(".")[0] - if not hasattr(cls, method.__name__): - # if the callable doesn't have a __name__ (probably a decorated method), - # it probably has a bad qualname too (such as my_decorator..wrapper) which is not - # great for finding out what the trace is about. So we use the method name instead and - # add a reference to the decorator name. Result: - # method call: @my_decorator(MyCharmWrappedMethods.b) - trace_method_name = f"@{qualname_c0}({cls.__name__}.{name})" - except Exception: # noqa: failsafe - pass - - new_method = trace_method(method, name=trace_method_name) - - if isinstance(inspect.getattr_static(cls, name), staticmethod): - new_method = staticmethod(new_method) - setattr(cls, name, new_method) - - return cls - - -def trace_method(method: _F, name: Optional[str] = None) -> _F: - """Trace this method. - - A span will be opened when this method is called and closed when it returns. - """ - return _trace_callable(method, "method", name=name) - - -def trace_function(function: _F, name: Optional[str] = None) -> _F: - """Trace this function. - - A span will be opened when this function is called and closed when it returns. - """ - return _trace_callable(function, "function", name=name) - - -def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: - dev_logger.info(f"instrumenting {callable}") - - # sig = inspect.signature(callable) - @functools.wraps(callable) - def wrapped_function(*args, **kwargs): # type: ignore - name_ = name or getattr( - callable, "__qualname__", getattr(callable, "__name__", str(callable)) - ) - with _span(f"{qualifier} call: {name_}"): # type: ignore - return callable(*args, **kwargs) # type: ignore - - # wrapped_function.__signature__ = sig - return wrapped_function # type: ignore - - -def trace(obj: Union[Type, Callable]): - """Trace this object and send the resulting spans to Tempo. - - It will dispatch to ``trace_type`` if the decorated object is a class, otherwise - ``trace_function``. - """ - if isinstance(obj, type): - if issubclass(obj, CharmBase): - raise ValueError( - "cannot use @trace on CharmBase subclasses: use @trace_charm instead " - "(we need some arguments!)" - ) - return trace_type(obj) - else: - try: - return trace_function(obj) - except Exception: - raise UntraceableObjectError( - f"cannot create span from {type(obj)}; instrument {obj} manually." - ) diff --git a/lib/charms/tempo_k8s/v2/tracing.py b/lib/charms/tempo_k8s/v2/tracing.py deleted file mode 100644 index 81bf1f1..0000000 --- a/lib/charms/tempo_k8s/v2/tracing.py +++ /dev/null @@ -1,996 +0,0 @@ -# Copyright 2024 Canonical Ltd. -# See LICENSE file for licensing details. -"""## Overview. - -This document explains how to integrate with the Tempo charm for the purpose of pushing traces to a -tracing endpoint provided by Tempo. It also explains how alternative implementations of the Tempo charm -may maintain the same interface and be backward compatible with all currently integrated charms. - -## Requirer Library Usage - -Charms seeking to push traces to Tempo, must do so using the `TracingEndpointRequirer` -object from this charm library. For the simplest use cases, using the `TracingEndpointRequirer` -object only requires instantiating it, typically in the constructor of your charm. The -`TracingEndpointRequirer` constructor requires the name of the relation over which a tracing endpoint - is exposed by the Tempo charm, and a list of protocols it intends to send traces with. - This relation must use the `tracing` interface. - The `TracingEndpointRequirer` object may be instantiated as follows - - from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer - - def __init__(self, *args): - super().__init__(*args) - # ... - self.tracing = TracingEndpointRequirer(self, - protocols=['otlp_grpc', 'otlp_http', 'jaeger_http_thrift'] - ) - # ... - -Note that the first argument (`self`) to `TracingEndpointRequirer` is always a reference to the -parent charm. - -Alternatively to providing the list of requested protocols at init time, the charm can do it at -any point in time by calling the -`TracingEndpointRequirer.request_protocols(*protocol:str, relation:Optional[Relation])` method. -Using this method also allows you to use per-relation protocols. - -Units of provider charms obtain the tempo endpoint to which they will push their traces by calling -`TracingEndpointRequirer.get_endpoint(protocol: str)`, where `protocol` is, for example: -- `otlp_grpc` -- `otlp_http` -- `zipkin` -- `tempo` - -If the `protocol` is not in the list of protocols that the charm requested at endpoint set-up time, -the library will raise an error. - -## Requirer Library Usage - -The `TracingEndpointProvider` object may be used by charms to manage relations with their -trace sources. For this purposes a Tempo-like charm needs to do two things - -1. Instantiate the `TracingEndpointProvider` object by providing it a -reference to the parent (Tempo) charm and optionally the name of the relation that the Tempo charm -uses to interact with its trace sources. This relation must conform to the `tracing` interface -and it is strongly recommended that this relation be named `tracing` which is its -default value. - -For example a Tempo charm may instantiate the `TracingEndpointProvider` in its constructor as -follows - - from charms.tempo_k8s.v2.tracing import TracingEndpointProvider - - def __init__(self, *args): - super().__init__(*args) - # ... - self.tracing = TracingEndpointProvider(self) - # ... - - - -""" # noqa: W505 -import enum -import json -import logging -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Dict, - List, - Literal, - MutableMapping, - Optional, - Sequence, - Tuple, - Union, - cast, -) - -import pydantic -from ops.charm import ( - CharmBase, - CharmEvents, - RelationBrokenEvent, - RelationEvent, - RelationRole, -) -from ops.framework import EventSource, Object -from ops.model import ModelError, Relation -from pydantic import BaseModel, Field - -# The unique Charmhub library identifier, never change it -LIBID = "12977e9aa0b34367903d8afeb8c3d85d" - -# Increment this major API version when introducing breaking changes -LIBAPI = 2 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 10 - -PYDEPS = ["pydantic"] - -logger = logging.getLogger(__name__) - -DEFAULT_RELATION_NAME = "tracing" -RELATION_INTERFACE_NAME = "tracing" - -# Supported list rationale https://github.com/canonical/tempo-coordinator-k8s-operator/issues/8 -ReceiverProtocol = Literal[ - "zipkin", - "otlp_grpc", - "otlp_http", - "jaeger_grpc", - "jaeger_thrift_http", -] - -RawReceiver = Tuple[ReceiverProtocol, str] -"""Helper type. A raw receiver is defined as a tuple consisting of the protocol name, and the (external, if available), -(secured, if available) resolvable server url. -""" - -BUILTIN_JUJU_KEYS = {"ingress-address", "private-address", "egress-subnets"} - - -class TransportProtocolType(str, enum.Enum): - """Receiver Type.""" - - http = "http" - grpc = "grpc" - - -receiver_protocol_to_transport_protocol: Dict[ReceiverProtocol, TransportProtocolType] = { - "zipkin": TransportProtocolType.http, - "otlp_grpc": TransportProtocolType.grpc, - "otlp_http": TransportProtocolType.http, - "jaeger_thrift_http": TransportProtocolType.http, - "jaeger_grpc": TransportProtocolType.grpc, -} -"""A mapping between telemetry protocols and their corresponding transport protocol. -""" - - -class TracingError(Exception): - """Base class for custom errors raised by this library.""" - - -class NotReadyError(TracingError): - """Raised by the provider wrapper if a requirer hasn't published the required data (yet).""" - - -class ProtocolNotRequestedError(TracingError): - """Raised if the user attempts to obtain an endpoint for a protocol it did not request.""" - - -class DataValidationError(TracingError): - """Raised when data validation fails on IPU relation data.""" - - -class AmbiguousRelationUsageError(TracingError): - """Raised when one wrongly assumes that there can only be one relation on an endpoint.""" - - -if int(pydantic.version.VERSION.split(".")[0]) < 2: - - class DatabagModel(BaseModel): # type: ignore - """Base databag model.""" - - class Config: - """Pydantic config.""" - - # ignore any extra fields in the databag - extra = "ignore" - """Ignore any extra fields in the databag.""" - allow_population_by_field_name = True - """Allow instantiating this class by field name (instead of forcing alias).""" - - _NEST_UNDER = None - - @classmethod - def load(cls, databag: MutableMapping): - """Load this model from a Juju databag.""" - if cls._NEST_UNDER: - return cls.parse_obj(json.loads(databag[cls._NEST_UNDER])) - - try: - data = { - k: json.loads(v) - for k, v in databag.items() - # Don't attempt to parse model-external values - if k in {f.alias for f in cls.__fields__.values()} - } - except json.JSONDecodeError as e: - msg = f"invalid databag contents: expecting json. {databag}" - logger.error(msg) - raise DataValidationError(msg) from e - - try: - return cls.parse_raw(json.dumps(data)) # type: ignore - except pydantic.ValidationError as e: - msg = f"failed to validate databag: {databag}" - logger.debug(msg, exc_info=True) - raise DataValidationError(msg) from e - - def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): - """Write the contents of this model to Juju databag. - - :param databag: the databag to write the data to. - :param clear: ensure the databag is cleared before writing it. - """ - if clear and databag: - databag.clear() - - if databag is None: - databag = {} - - if self._NEST_UNDER: - databag[self._NEST_UNDER] = self.json(by_alias=True) - return databag - - dct = self.dict() - for key, field in self.__fields__.items(): # type: ignore - value = dct[key] - databag[field.alias or key] = json.dumps(value) - - return databag - -else: - from pydantic import ConfigDict - - class DatabagModel(BaseModel): - """Base databag model.""" - - model_config = ConfigDict( - # ignore any extra fields in the databag - extra="ignore", - # Allow instantiating this class by field name (instead of forcing alias). - populate_by_name=True, - # Custom config key: whether to nest the whole datastructure (as json) - # under a field or spread it out at the toplevel. - _NEST_UNDER=None, # type: ignore - ) - """Pydantic config.""" - - @classmethod - def load(cls, databag: MutableMapping): - """Load this model from a Juju databag.""" - nest_under = cls.model_config.get("_NEST_UNDER") # type: ignore - if nest_under: - return cls.model_validate(json.loads(databag[nest_under])) # type: ignore - - try: - data = { - k: json.loads(v) - for k, v in databag.items() - # Don't attempt to parse model-external values - if k in {(f.alias or n) for n, f in cls.__fields__.items()} - } - except json.JSONDecodeError as e: - msg = f"invalid databag contents: expecting json. {databag}" - logger.error(msg) - raise DataValidationError(msg) from e - - try: - return cls.model_validate_json(json.dumps(data)) # type: ignore - except pydantic.ValidationError as e: - msg = f"failed to validate databag: {databag}" - logger.debug(msg, exc_info=True) - raise DataValidationError(msg) from e - - def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): - """Write the contents of this model to Juju databag. - - :param databag: the databag to write the data to. - :param clear: ensure the databag is cleared before writing it. - """ - if clear and databag: - databag.clear() - - if databag is None: - databag = {} - nest_under = self.model_config.get("_NEST_UNDER") - if nest_under: - databag[nest_under] = self.model_dump_json( # type: ignore - by_alias=True, - # skip keys whose values are default - exclude_defaults=True, - ) - return databag - - dct = self.model_dump() # type: ignore - for key, field in self.model_fields.items(): # type: ignore - value = dct[key] - if value == field.default: - continue - databag[field.alias or key] = json.dumps(value) - - return databag - - -# todo use models from charm-relation-interfaces -if int(pydantic.version.VERSION.split(".")[0]) < 2: - - class ProtocolType(BaseModel): # type: ignore - """Protocol Type.""" - - class Config: - """Pydantic config.""" - - use_enum_values = True - """Allow serializing enum values.""" - - name: str = Field( - ..., - description="Receiver protocol name. What protocols are supported (and what they are called) " - "may differ per provider.", - examples=["otlp_grpc", "otlp_http", "tempo_http"], - ) - - type: TransportProtocolType = Field( - ..., - description="The transport protocol used by this receiver.", - examples=["http", "grpc"], - ) - -else: - - class ProtocolType(BaseModel): - """Protocol Type.""" - - model_config = ConfigDict( # type: ignore - # Allow serializing enum values. - use_enum_values=True - ) - """Pydantic config.""" - - name: str = Field( - ..., - description="Receiver protocol name. What protocols are supported (and what they are called) " - "may differ per provider.", - examples=["otlp_grpc", "otlp_http", "tempo_http"], - ) - - type: TransportProtocolType = Field( - ..., - description="The transport protocol used by this receiver.", - examples=["http", "grpc"], - ) - - -class Receiver(BaseModel): - """Specification of an active receiver.""" - - protocol: ProtocolType = Field(..., description="Receiver protocol name and type.") - url: str = Field( - ..., - description="""URL at which the receiver is reachable. If there's an ingress, it would be the external URL. - Otherwise, it would be the service's fqdn or internal IP. - If the protocol type is grpc, the url will not contain a scheme.""", - examples=[ - "http://traefik_address:2331", - "https://traefik_address:2331", - "http://tempo_public_ip:2331", - "https://tempo_public_ip:2331", - "tempo_public_ip:2331", - ], - ) - - -class TracingProviderAppData(DatabagModel): # noqa: D101 - """Application databag model for the tracing provider.""" - - receivers: List[Receiver] = Field( - ..., - description="List of all receivers enabled on the tracing provider.", - ) - - -class TracingRequirerAppData(DatabagModel): # noqa: D101 - """Application databag model for the tracing requirer.""" - - receivers: List[ReceiverProtocol] - """Requested receivers.""" - - -class _AutoSnapshotEvent(RelationEvent): - __args__: Tuple[str, ...] = () - __optional_kwargs__: Dict[str, Any] = {} - - @classmethod - def __attrs__(cls): - return cls.__args__ + tuple(cls.__optional_kwargs__.keys()) - - def __init__(self, handle, relation, *args, **kwargs): - super().__init__(handle, relation) - - if not len(self.__args__) == len(args): - raise TypeError("expected {} args, got {}".format(len(self.__args__), len(args))) - - for attr, obj in zip(self.__args__, args): - setattr(self, attr, obj) - for attr, default in self.__optional_kwargs__.items(): - obj = kwargs.get(attr, default) - setattr(self, attr, obj) - - def snapshot(self) -> dict: - dct = super().snapshot() - for attr in self.__attrs__(): - obj = getattr(self, attr) - try: - dct[attr] = obj - except ValueError as e: - raise ValueError( - "cannot automagically serialize {}: " - "override this method and do it " - "manually.".format(obj) - ) from e - - return dct - - def restore(self, snapshot: dict) -> None: - super().restore(snapshot) - for attr, obj in snapshot.items(): - setattr(self, attr, obj) - - -class RelationNotFoundError(Exception): - """Raised if no relation with the given name is found.""" - - def __init__(self, relation_name: str): - self.relation_name = relation_name - self.message = "No relation named '{}' found".format(relation_name) - super().__init__(self.message) - - -class RelationInterfaceMismatchError(Exception): - """Raised if the relation with the given name has an unexpected interface.""" - - def __init__( - self, - relation_name: str, - expected_relation_interface: str, - actual_relation_interface: str, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_interface - self.actual_relation_interface = actual_relation_interface - self.message = ( - "The '{}' relation has '{}' as interface rather than the expected '{}'".format( - relation_name, actual_relation_interface, expected_relation_interface - ) - ) - - super().__init__(self.message) - - -class RelationRoleMismatchError(Exception): - """Raised if the relation with the given name has a different role than expected.""" - - def __init__( - self, - relation_name: str, - expected_relation_role: RelationRole, - actual_relation_role: RelationRole, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_role - self.actual_relation_role = actual_relation_role - self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( - relation_name, repr(actual_relation_role), repr(expected_relation_role) - ) - - super().__init__(self.message) - - -def _validate_relation_by_interface_and_direction( - charm: CharmBase, - relation_name: str, - expected_relation_interface: str, - expected_relation_role: RelationRole, -): - """Validate a relation. - - Verifies that the `relation_name` provided: (1) exists in metadata.yaml, - (2) declares as interface the interface name passed as `relation_interface` - and (3) has the right "direction", i.e., it is a relation that `charm` - provides or requires. - - Args: - charm: a `CharmBase` object to scan for the matching relation. - relation_name: the name of the relation to be verified. - expected_relation_interface: the interface name to be matched by the - relation named `relation_name`. - expected_relation_role: whether the `relation_name` must be either - provided or required by `charm`. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. - RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the same relation interface - as specified via the `expected_relation_interface` argument. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the same role as specified - via the `expected_relation_role` argument. - """ - if relation_name not in charm.meta.relations: - raise RelationNotFoundError(relation_name) - - relation = charm.meta.relations[relation_name] - - # fixme: why do we need to cast here? - actual_relation_interface = cast(str, relation.interface_name) - - if actual_relation_interface != expected_relation_interface: - raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface - ) - - if expected_relation_role is RelationRole.provides: - if relation_name not in charm.meta.provides: - raise RelationRoleMismatchError( - relation_name, RelationRole.provides, RelationRole.requires - ) - elif expected_relation_role is RelationRole.requires: - if relation_name not in charm.meta.requires: - raise RelationRoleMismatchError( - relation_name, RelationRole.requires, RelationRole.provides - ) - else: - raise TypeError("Unexpected RelationDirection: {}".format(expected_relation_role)) - - -class RequestEvent(RelationEvent): - """Event emitted when a remote requests a tracing endpoint.""" - - @property - def requested_receivers(self) -> List[ReceiverProtocol]: - """List of receiver protocols that have been requested.""" - relation = self.relation - app = relation.app - if not app: - raise NotReadyError("relation.app is None") - - return TracingRequirerAppData.load(relation.data[app]).receivers - - -class BrokenEvent(RelationBrokenEvent): - """Event emitted when a relation on tracing is broken.""" - - -class TracingEndpointProviderEvents(CharmEvents): - """TracingEndpointProvider events.""" - - request = EventSource(RequestEvent) - broken = EventSource(BrokenEvent) - - -class TracingEndpointProvider(Object): - """Class representing a trace receiver service.""" - - on = TracingEndpointProviderEvents() # type: ignore - - def __init__( - self, - charm: CharmBase, - external_url: Optional[str] = None, - relation_name: str = DEFAULT_RELATION_NAME, - ): - """Initialize. - - Args: - charm: a `CharmBase` instance that manages this instance of the Tempo service. - external_url: external address of the node hosting the tempo server, - if an ingress is present. - relation_name: an optional string name of the relation between `charm` - and the Tempo charmed service. The default is "tracing". - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. - RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the `tracing` relation - interface. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the `RelationRole.requires` - role. - """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides - ) - - super().__init__(charm, relation_name + "tracing-provider") - self._charm = charm - self._external_url = external_url - self._relation_name = relation_name - self.framework.observe( - self._charm.on[relation_name].relation_joined, self._on_relation_event - ) - self.framework.observe( - self._charm.on[relation_name].relation_created, self._on_relation_event - ) - self.framework.observe( - self._charm.on[relation_name].relation_changed, self._on_relation_event - ) - self.framework.observe( - self._charm.on[relation_name].relation_broken, self._on_relation_broken_event - ) - - def _on_relation_broken_event(self, e: RelationBrokenEvent): - """Handle relation broken events.""" - self.on.broken.emit(e.relation) - - def _on_relation_event(self, e: RelationEvent): - """Handle relation created/joined/changed events.""" - if self.is_requirer_ready(e.relation): - self.on.request.emit(e.relation) - - def is_requirer_ready(self, relation: Relation): - """Attempt to determine if requirer has already populated app data.""" - try: - self._get_requested_protocols(relation) - except NotReadyError: - return False - return True - - @staticmethod - def _get_requested_protocols(relation: Relation): - app = relation.app - if not app: - raise NotReadyError("relation.app is None") - - try: - databag = TracingRequirerAppData.load(relation.data[app]) - except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError): - logger.info(f"relation {relation} is not ready to talk tracing") - raise NotReadyError() - return databag.receivers - - def requested_protocols(self): - """All receiver protocols that have been requested by our related apps.""" - requested_protocols = set() - for relation in self.relations: - try: - protocols = self._get_requested_protocols(relation) - except NotReadyError: - continue - requested_protocols.update(protocols) - return requested_protocols - - @property - def relations(self) -> List[Relation]: - """All relations active on this endpoint.""" - return self._charm.model.relations[self._relation_name] - - def publish_receivers(self, receivers: Sequence[RawReceiver]): - """Let all requirers know that these receivers are active and listening.""" - if not self._charm.unit.is_leader(): - raise RuntimeError("only leader can do this") - - for relation in self.relations: - try: - TracingProviderAppData( - receivers=[ - Receiver( - url=url, - protocol=ProtocolType( - name=protocol, - type=receiver_protocol_to_transport_protocol[protocol], - ), - ) - for protocol, url in receivers - ], - ).dump(relation.data[self._charm.app]) - - except ModelError as e: - # args are bytes - msg = e.args[0] - if isinstance(msg, bytes): - if msg.startswith( - b"ERROR cannot read relation application settings: permission denied" - ): - logger.error( - f"encountered error {e} while attempting to update_relation_data." - f"The relation must be gone." - ) - continue - raise - - -class EndpointRemovedEvent(RelationBrokenEvent): - """Event representing a change in one of the receiver endpoints.""" - - -class EndpointChangedEvent(_AutoSnapshotEvent): - """Event representing a change in one of the receiver endpoints.""" - - __args__ = ("_receivers",) - - if TYPE_CHECKING: - _receivers = [] # type: List[dict] - - @property - def receivers(self) -> List[Receiver]: - """Cast receivers back from dict.""" - return [Receiver(**i) for i in self._receivers] - - -class TracingEndpointRequirerEvents(CharmEvents): - """TracingEndpointRequirer events.""" - - endpoint_changed = EventSource(EndpointChangedEvent) - endpoint_removed = EventSource(EndpointRemovedEvent) - - -class TracingEndpointRequirer(Object): - """A tracing endpoint for Tempo.""" - - on = TracingEndpointRequirerEvents() # type: ignore - - def __init__( - self, - charm: CharmBase, - relation_name: str = DEFAULT_RELATION_NAME, - protocols: Optional[List[ReceiverProtocol]] = None, - ): - """Construct a tracing requirer for a Tempo charm. - - If your application supports pushing traces to a distributed tracing backend, the - `TracingEndpointRequirer` object enables your charm to easily access endpoint information - exchanged over a `tracing` relation interface. - - Args: - charm: a `CharmBase` object that manages this - `TracingEndpointRequirer` object. Typically, this is `self` in the instantiating - class. - relation_name: an optional string name of the relation between `charm` - and the Tempo charmed service. The default is "tracing". It is strongly - advised not to change the default, so that people deploying your charm will have a - consistent experience with all other charms that provide tracing endpoints. - protocols: optional list of protocols that the charm intends to send traces with. - The provider will enable receivers for these and only these protocols, - so be sure to enable all protocols the charm or its workload are going to need. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. - RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the `tracing` relation - interface. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the `RelationRole.provides` - role. - """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires - ) - - super().__init__(charm, relation_name) - - self._is_single_endpoint = charm.meta.relations[relation_name].limit == 1 - - self._charm = charm - self._relation_name = relation_name - - events = self._charm.on[self._relation_name] - self.framework.observe(events.relation_changed, self._on_tracing_relation_changed) - self.framework.observe(events.relation_broken, self._on_tracing_relation_broken) - - if protocols: - self.request_protocols(protocols) - - def request_protocols( - self, protocols: Sequence[ReceiverProtocol], relation: Optional[Relation] = None - ): - """Publish the list of protocols which the provider should activate.""" - # todo: should we check if _is_single_endpoint and len(self.relations) > 1 and raise, here? - relations = [relation] if relation else self.relations - - if not protocols: - # empty sequence - raise ValueError( - "You need to pass a nonempty sequence of protocols to `request_protocols`." - ) - - try: - if self._charm.unit.is_leader(): - for relation in relations: - TracingRequirerAppData( - receivers=list(protocols), - ).dump(relation.data[self._charm.app]) - - except ModelError as e: - # args are bytes - msg = e.args[0] - if isinstance(msg, bytes): - if msg.startswith( - b"ERROR cannot read relation application settings: permission denied" - ): - logger.error( - f"encountered error {e} while attempting to request_protocols." - f"The relation must be gone." - ) - return - raise - - @property - def relations(self) -> List[Relation]: - """The tracing relations associated with this endpoint.""" - return self._charm.model.relations[self._relation_name] - - @property - def _relation(self) -> Optional[Relation]: - """If this wraps a single endpoint, the relation bound to it, if any.""" - if not self._is_single_endpoint: - objname = type(self).__name__ - raise AmbiguousRelationUsageError( - f"This {objname} wraps a {self._relation_name} endpoint that has " - "limit != 1. We can't determine what relation, of the possibly many, you are " - f"talking about. Please pass a relation instance while calling {objname}, " - "or set limit=1 in the charm metadata." - ) - relations = self.relations - return relations[0] if relations else None - - def is_ready(self, relation: Optional[Relation] = None): - """Is this endpoint ready?""" - relation = relation or self._relation - if not relation: - logger.debug(f"no relation on {self._relation_name !r}: tracing not ready") - return False - if relation.data is None: - logger.error(f"relation data is None for {relation}") - return False - if not relation.app: - logger.error(f"{relation} event received but there is no relation.app") - return False - try: - databag = dict(relation.data[relation.app]) - TracingProviderAppData.load(databag) - - except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError): - logger.info(f"failed validating relation data for {relation}") - return False - return True - - def _on_tracing_relation_changed(self, event): - """Notify the providers that there is new endpoint information available.""" - relation = event.relation - if not self.is_ready(relation): - self.on.endpoint_removed.emit(relation) # type: ignore - return - - data = TracingProviderAppData.load(relation.data[relation.app]) - self.on.endpoint_changed.emit(relation, [i.dict() for i in data.receivers]) # type: ignore - - def _on_tracing_relation_broken(self, event: RelationBrokenEvent): - """Notify the providers that the endpoint is broken.""" - relation = event.relation - self.on.endpoint_removed.emit(relation) # type: ignore - - def get_all_endpoints( - self, relation: Optional[Relation] = None - ) -> Optional[TracingProviderAppData]: - """Unmarshalled relation data.""" - relation = relation or self._relation - if not self.is_ready(relation): - return - return TracingProviderAppData.load(relation.data[relation.app]) # type: ignore - - def _get_endpoint( - self, relation: Optional[Relation], protocol: ReceiverProtocol - ) -> Optional[str]: - app_data = self.get_all_endpoints(relation) - if not app_data: - return None - receivers: List[Receiver] = list( - filter(lambda i: i.protocol.name == protocol, app_data.receivers) - ) - if not receivers: - logger.error(f"no receiver found with protocol={protocol!r}") - return - if len(receivers) > 1: - logger.error( - f"too many receivers with protocol={protocol!r}; using first one. Found: {receivers}" - ) - return - - receiver = receivers[0] - return receiver.url - - def get_endpoint( - self, protocol: ReceiverProtocol, relation: Optional[Relation] = None - ) -> Optional[str]: - """Receiver endpoint for the given protocol. - - It could happen that this function gets called before the provider publishes the endpoints. - In such a scenario, if a non-leader unit calls this function, a permission denied exception will be raised due to - restricted access. To prevent this, this function needs to be guarded by the `is_ready` check. - - Raises: - ProtocolNotRequestedError: - If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request. - """ - endpoint = self._get_endpoint(relation or self._relation, protocol=protocol) - if not endpoint: - requested_protocols = set() - relations = [relation] if relation else self.relations - for relation in relations: - try: - databag = TracingRequirerAppData.load(relation.data[self._charm.app]) - except DataValidationError: - continue - - requested_protocols.update(databag.receivers) - - if protocol not in requested_protocols: - raise ProtocolNotRequestedError(protocol, relation) - - return None - return endpoint - - -def charm_tracing_config( - endpoint_requirer: TracingEndpointRequirer, cert_path: Optional[Union[Path, str]] -) -> Tuple[Optional[str], Optional[str]]: - """Return the charm_tracing config you likely want. - - If no endpoint is provided: - disable charm tracing. - If https endpoint is provided but cert_path is not found on disk: - disable charm tracing. - If https endpoint is provided and cert_path is None: - ERROR - Else: - proceed with charm tracing (with or without tls, as appropriate) - - Usage: - If you are using charm_tracing >= v1.9: - >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config - >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") - >>> class MyCharm(...): - >>> _cert_path = "/path/to/cert/on/charm/container.crt" - >>> def __init__(self, ...): - >>> self.tracing = TracingEndpointRequirer(...) - >>> self.my_endpoint, self.cert_path = charm_tracing_config( - ... self.tracing, self._cert_path) - - If you are using charm_tracing < v1.9: - >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config - >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") - >>> class MyCharm(...): - >>> _cert_path = "/path/to/cert/on/charm/container.crt" - >>> def __init__(self, ...): - >>> self.tracing = TracingEndpointRequirer(...) - >>> self._my_endpoint, self._cert_path = charm_tracing_config( - ... self.tracing, self._cert_path) - >>> @property - >>> def my_endpoint(self): - >>> return self._my_endpoint - >>> @property - >>> def cert_path(self): - >>> return self._cert_path - - """ - if not endpoint_requirer.is_ready(): - return None, None - - endpoint = endpoint_requirer.get_endpoint("otlp_http") - if not endpoint: - return None, None - - is_https = endpoint.startswith("https://") - - if is_https: - if cert_path is None: - raise TracingError("Cannot send traces to an https endpoint without a certificate.") - elif not Path(cert_path).exists(): - # if endpoint is https BUT we don't have a server_cert yet: - # disable charm tracing until we do to prevent tls errors - return None, None - return endpoint, str(cert_path) - else: - return endpoint, None diff --git a/lib/charms/tls_certificates_interface/v3/tls_certificates.py b/lib/charms/tls_certificates_interface/v3/tls_certificates.py index da7fa95..141412b 100644 --- a/lib/charms/tls_certificates_interface/v3/tls_certificates.py +++ b/lib/charms/tls_certificates_interface/v3/tls_certificates.py @@ -318,7 +318,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 20 +LIBPATCH = 23 PYDEPS = ["cryptography", "jsonschema"] @@ -1902,10 +1902,20 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: ) else: try: + secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") logger.debug( "Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}" ) - secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") + # Juju < 3.6 will create a new revision even if the content is the same + if ( + secret.get_content(refresh=True).get("certificate", "") + == certificate.certificate + ): + logger.debug( + "Secret %s with correct certificate already exists", + f"{LIBID}-{csr_in_sha256_hex}", + ) + continue secret.set_content( {"certificate": certificate.certificate, "csr": certificate.csr} ) @@ -1986,11 +1996,19 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None: provider_certificate = self._find_certificate_in_relation_data(csr) if not provider_certificate: # A secret expired but we did not find matching certificate. Cleaning up + logger.warning( + "Failed to find matching certificate for csr, cleaning up secret %s", + event.secret.label, + ) event.secret.remove_all_revisions() return if not provider_certificate.expiry_time: # A secret expired but matching certificate is invalid. Cleaning up + logger.warning( + "Certificate matching csr is invalid, cleaning up secret %s", + event.secret.label, + ) event.secret.remove_all_revisions() return @@ -2023,14 +2041,18 @@ def _find_certificate_in_relation_data(self, csr: str) -> Optional[ProviderCerti return provider_certificate return None - def _get_csr_from_secret(self, secret: Secret) -> str: + def _get_csr_from_secret(self, secret: Secret) -> Union[str, None]: """Extract the CSR from the secret label or content. This function is a workaround to maintain backwards compatibility and fix the issue reported in https://github.com/canonical/tls-certificates-interface/issues/228 """ - if not (csr := secret.get_content().get("csr", "")): + try: + content = secret.get_content(refresh=True) + except SecretNotFoundError: + return None + if not (csr := content.get("csr", None)): # In versions <14 of the Lib we were storing the CSR in the label of the secret # The CSR now is stored int the content of the secret, which was a breaking change # Here we get the CSR if the secret was created by an app using libpatch 14 or lower diff --git a/src/charm.py b/src/charm.py index f052d10..fa14665 100755 --- a/src/charm.py +++ b/src/charm.py @@ -12,7 +12,7 @@ import yaml from charms.loki_k8s.v1.loki_push_api import LokiPushApiProvider from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer -from charms.tempo_k8s.v1.charm_tracing import trace_charm +from charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm from cosl import GrafanaDashboard from ops.main import main from ops.pebble import Layer diff --git a/src/grafana_agent.py b/src/grafana_agent.py index a4a3a33..2758658 100644 --- a/src/grafana_agent.py +++ b/src/grafana_agent.py @@ -33,7 +33,7 @@ from charms.prometheus_k8s.v1.prometheus_remote_write import ( PrometheusRemoteWriteConsumer, ) -from charms.tempo_k8s.v2.tracing import ( +from charms.tempo_coordinator_k8s.v0.tracing import ( ReceiverProtocol, TracingEndpointProvider, TracingEndpointRequirer, diff --git a/tests/scenario/test_tracing_integration.py b/tests/scenario/test_tracing_integration.py index 8d8dd47..6175631 100644 --- a/tests/scenario/test_tracing_integration.py +++ b/tests/scenario/test_tracing_integration.py @@ -3,8 +3,12 @@ import pytest import yaml -from charms.tempo_k8s.v1.charm_tracing import charm_tracing_disabled -from charms.tempo_k8s.v2.tracing import Receiver, TracingProviderAppData, TracingRequirerAppData +from charms.tempo_coordinator_k8s.v0.charm_tracing import charm_tracing_disabled +from charms.tempo_coordinator_k8s.v0.tracing import ( + Receiver, + TracingProviderAppData, + TracingRequirerAppData, +) from ops import pebble from ops.testing import Container, Context, Relation, State