diff --git a/config.yaml b/config.yaml index 7a9f482..2ed9b4b 100644 --- a/config.yaml +++ b/config.yaml @@ -10,3 +10,33 @@ options: Loki). type: boolean default: false + always_enable_zipkin: + description: > + Force-enable the receiver for the 'zipkin' protocol in Grafana Agent, + even if there is no integration currently requesting it. + type: boolean + default: false + always_enable_otlp_grpc: + description: > + Force-enable the receiver for the 'otlp_grpc' protocol in Grafana Agent, + even if there is no integration currently requesting it. + type: boolean + default: false + always_enable_otlp_http: + description: > + Force-enable the receiver for the 'otlp_http' protocol in Grafana Agent, + even if there is no integration currently requesting it. + type: boolean + default: false + always_enable_jaeger_grpc: + description: > + Force-enable the receiver for the 'jaeger_grpc' protocol in Grafana Agent, + even if there is no integration currently requesting it. + type: boolean + default: false + always_enable_jaeger_thrift_http: + description: > + Force-enable the receiver for the 'jaeger_thrift_http' protocol in Grafana Agent, + even if there is no integration currently requesting it. + type: boolean + default: false \ No newline at end of file diff --git a/lib/charms/certificate_transfer_interface/v0/certificate_transfer.py b/lib/charms/certificate_transfer_interface/v0/certificate_transfer.py index b07b835..caa6e22 100644 --- a/lib/charms/certificate_transfer_interface/v0/certificate_transfer.py +++ b/lib/charms/certificate_transfer_interface/v0/certificate_transfer.py @@ -96,7 +96,6 @@ def _on_certificate_removed(self, event: CertificateRemovedEvent): """ - import json import logging from typing import List, Mapping @@ -113,7 +112,7 @@ def _on_certificate_removed(self, event: CertificateRemovedEvent): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 7 +LIBPATCH = 8 PYDEPS = ["jsonschema"] diff --git a/lib/charms/grafana_cloud_integrator/v0/cloud_config_requirer.py b/lib/charms/grafana_cloud_integrator/v0/cloud_config_requirer.py index 6e01c26..7228a40 100644 --- a/lib/charms/grafana_cloud_integrator/v0/cloud_config_requirer.py +++ b/lib/charms/grafana_cloud_integrator/v0/cloud_config_requirer.py @@ -6,7 +6,7 @@ LIBID = "e6f580481c1b4388aa4d2cdf412a47fa" LIBAPI = 0 -LIBPATCH = 4 +LIBPATCH = 5 DEFAULT_RELATION_NAME = "grafana-cloud-config" @@ -121,12 +121,16 @@ def loki_endpoint(self) -> dict: def prometheus_ready(self): return self._is_not_empty(self.prometheus_url) + @property + def tempo_ready(self): + return self._is_not_empty(self.tempo_url) + @property def prometheus_endpoint(self) -> dict: """Return the prometheus endpoint dict.""" if not self.prometheus_ready: return {} - + endpoint = {} endpoint["url"] = self.prometheus_url if self.credentials: @@ -134,11 +138,15 @@ def prometheus_endpoint(self) -> dict: return endpoint @property - def loki_url(self): + def loki_url(self) -> str: return self._data.get("loki_url", "") @property - def prometheus_url(self): + def tempo_url(self) -> str: + return self._data.get("tempo_url", "") + + @property + def prometheus_url(self) -> str: return self._data.get("prometheus_url", "") @property diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py index 1f1bc4f..dfc32dd 100644 --- a/lib/charms/grafana_k8s/v0/grafana_dashboard.py +++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py @@ -219,7 +219,7 @@ def 
__init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 35 +LIBPATCH = 36 logger = logging.getLogger(__name__) @@ -1050,6 +1050,7 @@ def __init__( self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir) self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir) + self.framework.observe(self._charm.on.config_changed, self._update_all_dashboards_from_dir) self.framework.observe( self._charm.on[self._relation_name].relation_created, diff --git a/lib/charms/loki_k8s/v1/loki_push_api.py b/lib/charms/loki_k8s/v1/loki_push_api.py index bbae054..7f8372c 100644 --- a/lib/charms/loki_k8s/v1/loki_push_api.py +++ b/lib/charms/loki_k8s/v1/loki_push_api.py @@ -16,20 +16,24 @@ send log to Loki by implementing the consumer side of the `loki_push_api` relation interface. For instance, a Promtail or Grafana agent charm which needs to send logs to Loki. -- `LogProxyConsumer`: This object can be used by any Charmed Operator which needs to -send telemetry, such as logs, to Loki through a Log Proxy by implementing the consumer side of the -`loki_push_api` relation interface. +- `LogProxyConsumer`: DEPRECATED. +This object can be used by any Charmed Operator which needs to send telemetry, such as logs, to +Loki through a Log Proxy by implementing the consumer side of the `loki_push_api` relation +interface. +In order to be able to control the labels on the logs pushed this object adds a Pebble layer +that runs Promtail in the workload container, injecting Juju topology labels into the +logs on the fly. +This object is deprecated. Consider migrating to LogForwarder with the release of Juju 3.6 LTS. - `LogForwarder`: This object can be used by any Charmed Operator which needs to send the workload standard output (stdout) through Pebble's log forwarding mechanism, to Loki endpoints through the `loki_push_api` relation interface. +In order to be able to control the labels on the logs pushed this object updates the pebble layer's +"log-targets" section with Juju topology. Filtering logs in Loki is largely performed on the basis of labels. In the Juju ecosystem, Juju topology labels are used to uniquely identify the workload which generates telemetry like logs. -In order to be able to control the labels on the logs pushed this object adds a Pebble layer -that runs Promtail in the workload container, injecting Juju topology labels into the -logs on the fly. ## LokiPushApiProvider Library Usage @@ -42,13 +46,14 @@ - `charm`: A reference to the parent (Loki) charm. - `relation_name`: The name of the relation that the charm uses to interact - with its clients, which implement `LokiPushApiConsumer` or `LogProxyConsumer`. + with its clients, which implement `LokiPushApiConsumer` `LogForwarder`, or `LogProxyConsumer` + (note that LogProxyConsumer is deprecated). If provided, this relation name must match a provided relation in metadata.yaml with the `loki_push_api` interface. - The default relation name is "logging" for `LokiPushApiConsumer` and "log-proxy" for - `LogProxyConsumer`. + The default relation name is "logging" for `LokiPushApiConsumer` and `LogForwarder`, and + "log-proxy" for `LogProxyConsumer` (note that LogProxyConsumer is deprecated). For example, a provider's `metadata.yaml` file may look as follows: @@ -223,6 +228,9 @@ def __init__(self, *args): ## LogProxyConsumer Library Usage +> Note: This object is deprecated. 
Consider migrating to LogForwarder with the release of Juju 3.6 +> LTS. + Let's say that we have a workload charm that produces logs, and we need to send those logs to a workload implementing the `loki_push_api` interface, such as `Loki` or `Grafana Agent`. @@ -519,7 +527,7 @@ def _alert_rules_error(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 8 +LIBPATCH = 12 PYDEPS = ["cosl"] @@ -534,13 +542,21 @@ def _alert_rules_error(self, event): # To update Promtail version you only need to change the PROMTAIL_VERSION and # update all sha256 sums in PROMTAIL_BINARIES. To support a new architecture # you only need to add a new key value pair for the architecture in PROMTAIL_BINARIES. -PROMTAIL_VERSION = "v2.5.0" +PROMTAIL_VERSION = "v2.9.7" +PROMTAIL_ARM_BINARY = { + "filename": "promtail-static-arm64", + "zipsha": "c083fdb45e5c794103f974eeb426489b4142438d9e10d0ae272b2aff886e249b", + "binsha": "4cd055c477a301c0bdfdbcea514e6e93f6df5d57425ce10ffc77f3e16fec1ddf", +} + PROMTAIL_BINARIES = { "amd64": { "filename": "promtail-static-amd64", - "zipsha": "543e333b0184e14015a42c3c9e9e66d2464aaa66eca48b29e185a6a18f67ab6d", - "binsha": "17e2e271e65f793a9fbe81eab887b941e9d680abe82d5a0602888c50f5e0cac9", + "zipsha": "6873cbdabf23062aeefed6de5f00ff382710332af3ab90a48c253ea17e08f465", + "binsha": "28da9b99f81296fe297831f3bc9d92aea43b4a92826b8ff04ba433b8cb92fb50", }, + "arm64": PROMTAIL_ARM_BINARY, + "aarch64": PROMTAIL_ARM_BINARY, } # Paths in `charm` container @@ -1585,7 +1601,8 @@ def __init__( the Loki API endpoint to push logs. It is intended for workloads that can speak loki_push_api (https://grafana.com/docs/loki/latest/api/#push-log-entries-to-loki), such as grafana-agent. - (If you only need to forward a few workload log files, then use LogProxyConsumer.) + (If you need to forward workload stdout logs, then use LogForwarder; if you need to forward + log files, then use LogProxyConsumer.) `LokiPushApiConsumer` can be instantiated as follows: @@ -1760,6 +1777,9 @@ class LogProxyEvents(ObjectEvents): class LogProxyConsumer(ConsumerBase): """LogProxyConsumer class. + > Note: This object is deprecated. Consider migrating to LogForwarder with the release of Juju + > 3.6 LTS. + The `LogProxyConsumer` object provides a method for attaching `promtail` to a workload in order to generate structured logging data from applications which traditionally log to syslog or do not have native Loki integration. @@ -1831,7 +1851,12 @@ def __init__( # architecture used for promtail binary arch = platform.processor() - self._arch = "amd64" if arch == "x86_64" else arch + if arch in ["x86_64", "amd64"]: + self._arch = "amd64" + elif arch in ["aarch64", "arm64", "armv8b", "armv8l"]: + self._arch = "arm64" + else: + self._arch = arch events = self._charm.on[relation_name] self.framework.observe(events.relation_created, self._on_relation_created) @@ -2537,7 +2562,9 @@ def _update_logging(self, _): return for container in self._charm.unit.containers.values(): - self._update_endpoints(container, loki_endpoints) + if container.can_connect(): + self._update_endpoints(container, loki_endpoints) + # else: `_update_endpoints` will be called on pebble-ready anyway. 
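
A minimal sketch of the `LogForwarder` migration that the deprecation notes above recommend. The import path and default `"logging"` relation name follow the v1 `loki_push_api` library touched in this patch; the charm class and relation wiring are hypothetical, not part of this change.

```python
# Hypothetical charm snippet: replacing the deprecated LogProxyConsumer with LogForwarder.
# Assumes a "logging" relation with the `loki_push_api` interface declared in metadata.yaml.
from charms.loki_k8s.v1.loki_push_api import LogForwarder
from ops.charm import CharmBase


class MyWorkloadCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        # Pebble forwards the workload's stdout/stderr to the related Loki endpoints;
        # Juju topology labels are injected via the pebble layer's "log-targets" section.
        self._log_forwarder = LogForwarder(self, relation_name="logging")
```
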
def _retrieve_endpoints_from_relation(self) -> dict: loki_endpoints = {} diff --git a/lib/charms/observability_libs/v1/cert_handler.py b/lib/charms/observability_libs/v1/cert_handler.py index c482662..3b87ad4 100644 --- a/lib/charms/observability_libs/v1/cert_handler.py +++ b/lib/charms/observability_libs/v1/cert_handler.py @@ -58,7 +58,7 @@ import logging -from ops.charm import CharmBase, RelationBrokenEvent +from ops.charm import CharmBase from ops.framework import EventBase, EventSource, Object, ObjectEvents from ops.jujuversion import JujuVersion from ops.model import Relation, Secret, SecretNotFoundError @@ -67,7 +67,7 @@ LIBID = "b5cd5cd580f3428fa5f59a8876dcbe6a" LIBAPI = 1 -LIBPATCH = 8 +LIBPATCH = 11 VAULT_SECRET_LABEL = "cert-handler-private-vault" @@ -260,7 +260,13 @@ def retrieve(self) -> Dict[str, str]: def clear(self): """Clear the vault.""" - self._backend.clear() + try: + self._backend.clear() + except SecretNotFoundError: + # guard against: https://github.com/canonical/observability-libs/issues/95 + # this is fine, it might mean an earlier hook had already called .clear() + # not sure what exactly the root cause is, might be a juju bug + logger.debug("Could not clear vault: secret is gone already.") class CertHandler(Object): @@ -274,6 +280,7 @@ def __init__( *, key: str, certificates_relation_name: str = "certificates", + peer_relation_name: str = "peers", cert_subject: Optional[str] = None, sans: Optional[List[str]] = None, ): @@ -285,7 +292,11 @@ def __init__( charm: The owning charm. key: A manually-crafted, static, unique identifier used by ops to identify events. It shouldn't change between one event to another. - certificates_relation_name: Must match metadata.yaml. + certificates_relation_name: Name of the certificates relation over which we obtain TLS certificates. + Must match metadata.yaml. + peer_relation_name: Name of a peer relation used to store our secrets. + Only used on older Juju versions where secrets are not supported. + Must match metadata.yaml. cert_subject: Custom subject. Name collisions are under the caller's responsibility. sans: DNS names. If none are given, use FQDN. """ @@ -309,7 +320,7 @@ def __init__( # self.framework.observe(self.charm.on.secret_remove, self._rotate_csr) else: - vault_backend = _RelationVaultBackend(charm, relation_name="peers") + vault_backend = _RelationVaultBackend(charm, relation_name=peer_relation_name) self.vault = Vault(vault_backend) self.certificates_relation_name = certificates_relation_name @@ -339,10 +350,6 @@ def __init__( self.certificates.on.all_certificates_invalidated, # pyright: ignore self._on_all_certificates_invalidated, ) - self.framework.observe( - self.charm.on[self.certificates_relation_name].relation_broken, # pyright: ignore - self._on_certificates_relation_broken, - ) self.framework.observe( self.charm.on.upgrade_charm, # pyright: ignore self._on_upgrade_charm, @@ -391,30 +398,37 @@ def _migrate_vault(self): @property def enabled(self) -> bool: - """Boolean indicating whether the charm has a tls_certificates relation.""" + """Boolean indicating whether the charm has a tls_certificates relation. + + See also the `available` property. + """ # We need to check for units as a temporary workaround because of https://bugs.launchpad.net/juju/+bug/2024583 # This could in theory not work correctly on scale down to 0 but it is necessary for the moment. 
- if not self.charm.model.get_relation(self.certificates_relation_name): + if not self.relation: return False - if not self.charm.model.get_relation( - self.certificates_relation_name - ).units: # pyright: ignore + if not self.relation.units: # pyright: ignore return False - if not self.charm.model.get_relation( - self.certificates_relation_name - ).app: # pyright: ignore + if not self.relation.app: # pyright: ignore return False - if not self.charm.model.get_relation( - self.certificates_relation_name - ).data: # pyright: ignore + if not self.relation.data: # pyright: ignore return False return True + @property + def available(self) -> bool: + """Return True if all certs are available in relation data; False otherwise.""" + return ( + self.enabled + and self.server_cert is not None + and self.private_key is not None + and self.ca_cert is not None + ) + def _on_certificates_relation_joined(self, _) -> None: # this will only generate a csr if we don't have one already self._generate_csr() @@ -507,7 +521,7 @@ def _csr(self) -> Optional[str]: # ignoring all but the last one. if len(csrs) > 1: logger.warning( - "Multiple CSRs found in `certificates` relation. " + f"Multiple CSRs found in {self.certificates_relation_name!r} relation. " "cert_handler is not ready to expect it." ) @@ -562,14 +576,13 @@ def _on_certificate_invalidated(self, event: CertificateInvalidatedEvent) -> Non self.on.cert_changed.emit() # pyright: ignore def _on_all_certificates_invalidated(self, _: AllCertificatesInvalidatedEvent) -> None: - # Do what you want with this information, probably remove all certificates - # Note: assuming "limit: 1" in metadata - self._generate_csr(overwrite=True, clear_cert=True) - self.on.cert_changed.emit() # pyright: ignore - - def _on_certificates_relation_broken(self, _: RelationBrokenEvent) -> None: """Clear all secrets data when removing the relation.""" + # Note: assuming "limit: 1" in metadata + # The "certificates_relation_broken" event is converted to "all invalidated" custom + # event by the tls-certificates library. Per convention, we let the lib manage the + # relation and we do not observe "certificates_relation_broken" directly. self.vault.clear() + # We do not generate a CSR here because the relation is gone. self.on.cert_changed.emit() # pyright: ignore def _check_juju_supports_secrets(self) -> bool: diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 72c3fe7..e3d35c6 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -178,7 +178,7 @@ def __init__(self, *args): - `scrape_timeout` - `proxy_url` - `relabel_configs` -- `metrics_relabel_configs` +- `metric_relabel_configs` - `sample_limit` - `label_limit` - `label_name_length_limit` @@ -362,7 +362,7 @@ def _on_scrape_targets_changed(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 45 +LIBPATCH = 47 PYDEPS = ["cosl"] @@ -377,7 +377,7 @@ def _on_scrape_targets_changed(self, event): "scrape_timeout", "proxy_url", "relabel_configs", - "metrics_relabel_configs", + "metric_relabel_configs", "sample_limit", "label_limit", "label_name_length_limit", @@ -521,8 +521,8 @@ def expand_wildcard_targets_into_individual_jobs( # for such a target. Therefore labeling with Juju topology, excluding the # unit name. 
non_wildcard_static_config["labels"] = { - **non_wildcard_static_config.get("labels", {}), **topology.label_matcher_dict, + **non_wildcard_static_config.get("labels", {}), } non_wildcard_static_configs.append(non_wildcard_static_config) @@ -547,9 +547,9 @@ def expand_wildcard_targets_into_individual_jobs( if topology: # Add topology labels modified_static_config["labels"] = { - **modified_static_config.get("labels", {}), **topology.label_matcher_dict, **{"juju_unit": unit_name}, + **modified_static_config.get("labels", {}), } # Instance relabeling for topology should be last in order. diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index 7c11885..4306b4d 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -9,21 +9,57 @@ This means that, if your charm is related to, for example, COS' Tempo charm, you will be able to inspect in real time from the Grafana dashboard the execution flow of your charm. -To start using this library, you need to do two things: +# Quickstart +Fetch the following charm libs (and ensure the minimum version/revision numbers are satisfied): + + charmcraft fetch-lib charms.tempo_k8s.v2.tracing # >= 1.10 + charmcraft fetch-lib charms.tempo_k8s.v1.charm_tracing # >= 2.7 + +Then edit your charm code to include: + +```python +# import the necessary charm libs +from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer, charm_tracing_config +from charms.tempo_k8s.v1.charm_tracing import charm_tracing + +# decorate your charm class with charm_tracing: +@charm_tracing( + # forward-declare the instance attributes that the instrumentor will look up to obtain the + # tempo endpoint and server certificate + tracing_endpoint="tracing_endpoint", + server_cert="server_cert" +) +class MyCharm(CharmBase): + _path_to_cert = "/path/to/cert.crt" + # path to cert file **in the charm container**. Its presence will be used to determine whether + # the charm is ready to use tls for encrypting charm traces. If your charm does not support tls, + # you can ignore this and pass None to charm_tracing_config. + # If you do support TLS, you'll need to make sure that the server cert is copied to this location + # and kept up to date so the instrumentor can use it. + + def __init__(self, ...): + ... + self.tracing = TracingEndpointRequirer(self, ...) + self.tracing_endpoint, self.server_cert = charm_tracing_config(self.tracing, self._path_to_cert) +``` + +# Detailed usage +To use this library, you need to do two things: 1) decorate your charm class with `@trace_charm(tracing_endpoint="my_tracing_endpoint")` -2) add to your charm a "my_tracing_endpoint" (you can name this attribute whatever you like) **property** -that returns an otlp http/https endpoint url. If you are using the `TracingEndpointProvider` as -`self.tracing = TracingEndpointProvider(self)`, the implementation could be: +2) add to your charm a "my_tracing_endpoint" (you can name this attribute whatever you like) +**property**, **method** or **instance attribute** that returns an otlp http/https endpoint url. 
+If you are using the ``charms.tempo_k8s.v2.tracing.TracingEndpointRequirer`` as +``self.tracing = TracingEndpointRequirer(self)``, the implementation could be: ``` @property def my_tracing_endpoint(self) -> Optional[str]: '''Tempo endpoint for charm tracing''' if self.tracing.is_ready(): - return self.tracing.otlp_http_endpoint() + return self.tracing.get_endpoint("otlp_http") else: return None ``` @@ -33,19 +69,52 @@ def my_tracing_endpoint(self) -> Optional[str]: - every event as a span (including custom events) - every charm method call (except dunders) as a span -if you wish to add more fine-grained information to the trace, you can do so by getting a hold of the tracer like so: + +## TLS support +If your charm integrates with a TLS provider which is also trusted by the tracing provider (the Tempo charm), +you can configure ``charm_tracing`` to use TLS by passing a ``server_cert`` parameter to the decorator. + +If your charm is not trusting the same CA as the Tempo endpoint it is sending traces to, +you'll need to implement a cert-transfer relation to obtain the CA certificate from the same +CA that Tempo is using. + +For example: +``` +from charms.tempo_k8s.v1.charm_tracing import trace_charm +@trace_charm( + tracing_endpoint="my_tracing_endpoint", + server_cert="_server_cert" +) +class MyCharm(CharmBase): + self._server_cert = "/path/to/server.crt" + ... + + def on_tls_changed(self, e) -> Optional[str]: + # update the server cert on the charm container for charm tracing + Path(self._server_cert).write_text(self.get_server_cert()) + + def on_tls_broken(self, e) -> Optional[str]: + # remove the server cert so charm_tracing won't try to use tls anymore + Path(self._server_cert).unlink() +``` + + +## More fine-grained manual instrumentation +if you wish to add more spans to the trace, you can do so by getting a hold of the tracer like so: ``` import opentelemetry ... - @property - def tracer(self) -> opentelemetry.trace.Tracer: - return opentelemetry.trace.get_tracer(type(self).__name__) +def get_tracer(self) -> opentelemetry.trace.Tracer: + return opentelemetry.trace.get_tracer(type(self).__name__) ``` By default, the tracer is named after the charm type. If you wish to override that, you can pass -a different `service_name` argument to `trace_charm`. +a different ``service_name`` argument to ``trace_charm``. -*Upgrading from `v0`:* +See the official opentelemetry Python SDK documentation for usage: +https://opentelemetry-python.readthedocs.io/en/latest/ + +## Upgrading from `v0` If you are upgrading from `charm_tracing` v0, you need to take the following steps (assuming you already have the newest version of the library in your charm): @@ -55,8 +124,9 @@ def tracer(self) -> opentelemetry.trace.Tracer: `opentelemetry-exporter-otlp-proto-http>=1.21.0`. -2) Update the charm method referenced to from `@trace` and `@trace_charm`, -to return from `TracingEndpointRequirer.otlp_http_endpoint()` instead of `grpc_http`. For example: +2) Update the charm method referenced to from ``@trace`` and ``@trace_charm``, +to return from ``TracingEndpointRequirer.get_endpoint("otlp_http")`` instead of ``grpc_http``. +For example: ``` from charms.tempo_k8s.v0.charm_tracing import trace_charm @@ -72,7 +142,7 @@ class MyCharm(CharmBase): def my_tracing_endpoint(self) -> Optional[str]: '''Tempo endpoint for charm tracing''' if self.tracing.is_ready(): - return self.tracing.otlp_grpc_endpoint() + return self.tracing.otlp_grpc_endpoint() # OLD API, DEPRECATED. 
else: return None ``` @@ -93,21 +163,23 @@ class MyCharm(CharmBase): def my_tracing_endpoint(self) -> Optional[str]: '''Tempo endpoint for charm tracing''' if self.tracing.is_ready(): - return self.tracing.otlp_http_endpoint() + return self.tracing.get_endpoint("otlp_http") # NEW API, use this. else: return None ``` -3) If you were passing a certificate using `server_cert`, you need to change it to provide an *absolute* path to -the certificate file. +3) If you were passing a certificate (str) using `server_cert`, you need to change it to +provide an *absolute* path to the certificate file instead. """ import functools import inspect import logging import os +import shutil from contextlib import contextmanager from contextvars import Context, ContextVar, copy_context +from importlib.metadata import distributions from pathlib import Path from typing import ( Any, @@ -122,19 +194,19 @@ def my_tracing_endpoint(self) -> Optional[str]: ) import opentelemetry +import ops from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import Span, TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.trace import INVALID_SPAN, Tracer +from opentelemetry.trace import get_current_span as otlp_get_current_span from opentelemetry.trace import ( - INVALID_SPAN, - Tracer, get_tracer, get_tracer_provider, set_span_in_context, set_tracer_provider, ) -from opentelemetry.trace import get_current_span as otlp_get_current_span from ops.charm import CharmBase from ops.framework import Framework @@ -147,14 +219,23 @@ def my_tracing_endpoint(self) -> Optional[str]: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 6 +LIBPATCH = 13 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] logger = logging.getLogger("tracing") +dev_logger = logging.getLogger("tracing-dev") + +# set this to 0 if you are debugging/developing this library source +dev_logger.setLevel(logging.CRITICAL) + +_CharmType = Type[CharmBase] # the type CharmBase and any subclass thereof +_C = TypeVar("_C", bound=_CharmType) +_T = TypeVar("_T", bound=type) +_F = TypeVar("_F", bound=Type[Callable]) tracer: ContextVar[Tracer] = ContextVar("tracer") -_GetterType = Union[Callable[[CharmBase], Optional[str]], property] +_GetterType = Union[Callable[[_CharmType], Optional[str]], property] CHARM_TRACING_ENABLED = "CHARM_TRACING_ENABLED" @@ -220,11 +301,6 @@ def _span(name: str) -> Generator[Optional[Span], Any, Any]: yield None -_C = TypeVar("_C", bound=Type[CharmBase]) -_T = TypeVar("_T", bound=type) -_F = TypeVar("_F", bound=Type[Callable]) - - class TracingError(RuntimeError): """Base class for errors raised by this module.""" @@ -233,60 +309,102 @@ class UntraceableObjectError(TracingError): """Raised when an object you're attempting to instrument cannot be autoinstrumented.""" -def _get_tracing_endpoint(tracing_endpoint_getter, self, charm): - if isinstance(tracing_endpoint_getter, property): - tracing_endpoint = tracing_endpoint_getter.__get__(self) - else: # method or callable - tracing_endpoint = tracing_endpoint_getter(self) +class TLSError(TracingError): + """Raised when the tracing endpoint is https but we don't have a cert yet.""" + + +def _get_tracing_endpoint( + tracing_endpoint_attr: str, + charm_instance: object, + charm_type: type, +): + _tracing_endpoint = getattr(charm_instance, tracing_endpoint_attr) + if 
callable(_tracing_endpoint): + tracing_endpoint = _tracing_endpoint() + else: + tracing_endpoint = _tracing_endpoint if tracing_endpoint is None: - logger.debug( - f"{charm}.{tracing_endpoint_getter} returned None; quietly disabling " - f"charm_tracing for the run." - ) return + elif not isinstance(tracing_endpoint, str): raise TypeError( - f"{charm}.{tracing_endpoint_getter} should return a tempo endpoint (string); " + f"{charm_type.__name__}.{tracing_endpoint_attr} should resolve to a tempo endpoint (string); " f"got {tracing_endpoint} instead." ) - else: - logger.debug(f"Setting up span exporter to endpoint: {tracing_endpoint}/v1/traces") + + dev_logger.debug(f"Setting up span exporter to endpoint: {tracing_endpoint}/v1/traces") return f"{tracing_endpoint}/v1/traces" -def _get_server_cert(server_cert_getter, self, charm): - if isinstance(server_cert_getter, property): - server_cert = server_cert_getter.__get__(self) - else: # method or callable - server_cert = server_cert_getter(self) +def _get_server_cert( + server_cert_attr: str, + charm_instance: ops.CharmBase, + charm_type: Type[ops.CharmBase], +): + _server_cert = getattr(charm_instance, server_cert_attr) + if callable(_server_cert): + server_cert = _server_cert() + else: + server_cert = _server_cert if server_cert is None: logger.warning( - f"{charm}.{server_cert_getter} returned None; sending traces over INSECURE connection." + f"{charm_type}.{server_cert_attr} is None; sending traces over INSECURE connection." ) return elif not Path(server_cert).is_absolute(): raise ValueError( - f"{charm}.{server_cert_getter} should return a valid tls cert absolute path (string | Path)); " + f"{charm_type}.{server_cert_attr} should resolve to a valid tls cert absolute path (string | Path)); " f"got {server_cert} instead." ) return server_cert +def _remove_stale_otel_sdk_packages(): + """Hack to remove stale opentelemetry sdk packages from the charm's python venv. + + See https://github.com/canonical/grafana-agent-operator/issues/146 and + https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after + this juju issue is resolved and sufficient time has passed to expect most users of this library + have migrated to the patched version of juju. + + This only does something if executed on an upgrade-charm event. + """ + if os.getenv("JUJU_DISPATCH_PATH") == "hooks/upgrade-charm": + logger.debug("Executing _remove_stale_otel_sdk_packages patch on charm upgrade") + # Find any opentelemetry_sdk distributions + otel_sdk_distributions = list(distributions(name="opentelemetry_sdk")) + # If there is more than 1, inspect each and if it has 0 entrypoints, infer that it is stale + if len(otel_sdk_distributions) > 1: + for distribution in otel_sdk_distributions: + if len(distribution.entry_points) == 0: + # Distribution appears to be empty. 
Remove it + path = distribution._path # type: ignore + logger.debug(f"Removing empty opentelemetry_sdk distribution at: {path}") + shutil.rmtree(path) + + def _setup_root_span_initializer( - charm: Type[CharmBase], - tracing_endpoint_getter: _GetterType, - server_cert_getter: Optional[_GetterType], + charm_type: _CharmType, + tracing_endpoint_attr: str, + server_cert_attr: Optional[str], service_name: Optional[str] = None, ): """Patch the charm's initializer.""" - original_init = charm.__init__ + original_init = charm_type.__init__ @functools.wraps(original_init) def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): + # we're using 'self' here because this is charm init code, makes sense to read what's below + # from the perspective of the charm. Self.unit.name... + original_init(self, framework, *args, **kwargs) + # we call this from inside the init context instead of, say, _autoinstrument, because we want it to + # be checked on a per-charm-instantiation basis, not on a per-type-declaration one. if not is_enabled(): + # this will only happen during unittesting, hopefully, so it's fine to log a + # bit more verbosely logger.info("Tracing DISABLED: skipping root span initialization") return @@ -295,41 +413,45 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): # self.handle = Handle(None, self.handle_kind, None) original_event_context = framework._event_context - - _service_name = service_name or self.app.name - + # default service name isn't just app name because it could conflict with the workload service name + _service_name = service_name or f"{self.app.name}-charm" + + unit_name = self.unit.name + # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. + # it could be trouble if someone ever decides to implement their own tracer parallel to + # ours and before the charm has inited. We assume they won't. + _remove_stale_otel_sdk_packages() resource = Resource.create( attributes={ "service.name": _service_name, "compose_service": _service_name, "charm_type": type(self).__name__, # juju topology - "juju_unit": self.unit.name, + "juju_unit": unit_name, "juju_application": self.app.name, "juju_model": self.model.name, "juju_model_uuid": self.model.uuid, } ) provider = TracerProvider(resource=resource) - try: - tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_getter, self, charm) - except Exception: - # if anything goes wrong with retrieving the endpoint, we go on with tracing disabled. - # better than breaking the charm. - logger.exception( - f"exception retrieving the tracing " - f"endpoint from {charm}.{tracing_endpoint_getter}; " - f"proceeding with charm_tracing DISABLED. " - ) - return + + # if anything goes wrong with retrieving the endpoint, we let the exception bubble up. + tracing_endpoint = _get_tracing_endpoint(tracing_endpoint_attr, self, charm_type) if not tracing_endpoint: + # tracing is off if tracing_endpoint is None return server_cert: Optional[Union[str, Path]] = ( - _get_server_cert(server_cert_getter, self, charm) if server_cert_getter else None + _get_server_cert(server_cert_attr, self, charm_type) if server_cert_attr else None ) + if tracing_endpoint.startswith("https://") and not server_cert: + raise TLSError( + "Tracing endpoint is https, but no server_cert has been passed." + "Please point @trace_charm to a `server_cert` attr." 
+ ) + exporter = OTLPSpanExporter( endpoint=tracing_endpoint, certificate_file=str(Path(server_cert).absolute()) if server_cert else None, @@ -342,16 +464,18 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): _tracer = get_tracer(_service_name) # type: ignore _tracer_token = tracer.set(_tracer) - dispatch_path = os.getenv("JUJU_DISPATCH_PATH", "") + dispatch_path = os.getenv("JUJU_DISPATCH_PATH", "") # something like hooks/install + event_name = dispatch_path.split("/")[1] if "/" in dispatch_path else dispatch_path + root_span_name = f"{unit_name}: {event_name} event" + span = _tracer.start_span(root_span_name, attributes={"juju.dispatch_path": dispatch_path}) # all these shenanigans are to work around the fact that the opentelemetry tracing API is built # on the assumption that spans will be used as contextmanagers. # Since we don't (as we need to close the span on framework.commit), # we need to manually set the root span as current. - span = _tracer.start_span("charm exec", attributes={"juju.dispatch_path": dispatch_path}) ctx = set_span_in_context(span) - # log a trace id so we can look it up in tempo. + # log a trace id, so we can pick it up from the logs (and jhack) to look it up in tempo. root_trace_id = hex(span.get_span_context().trace_id)[2:] # strip 0x prefix logger.debug(f"Starting root trace with id={root_trace_id!r}.") @@ -359,6 +483,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): @contextmanager def wrap_event_context(event_name: str): + dev_logger.info(f"entering event context: {event_name}") # when the framework enters an event context, we create a span. with _span("event: " + event_name) as event_context_span: if event_context_span: @@ -372,6 +497,7 @@ def wrap_event_context(event_name: str): @functools.wraps(original_close) def wrap_close(): + dev_logger.info("tearing down tracer and flushing traces") span.end() opentelemetry.context.detach(span_token) # type: ignore tracer.reset(_tracer_token) @@ -383,7 +509,7 @@ def wrap_close(): framework.close = wrap_close return - charm.__init__ = wrap_init + charm_type.__init__ = wrap_init # type: ignore def trace_charm( @@ -391,7 +517,7 @@ def trace_charm( server_cert: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), -): +) -> Callable[[_T], _T]: """Autoinstrument the decorated charm with tracing telemetry. Use this function to get out-of-the-box traces for all events emitted on this charm and all @@ -399,7 +525,7 @@ def trace_charm( Usage: >>> from charms.tempo_k8s.v1.charm_tracing import trace_charm - >>> from charms.tempo_k8s.v1.tracing import TracingEndpointProvider + >>> from charms.tempo_k8s.v1.tracing import TracingEndpointRequirer >>> from ops import CharmBase >>> >>> @trace_charm( @@ -409,7 +535,7 @@ def trace_charm( >>> >>> def __init__(self, framework: Framework): >>> ... - >>> self.tracing = TracingEndpointProvider(self) + >>> self.tracing = TracingEndpointRequirer(self) >>> >>> @property >>> def tempo_otlp_http_endpoint(self) -> Optional[str]: @@ -418,24 +544,28 @@ def trace_charm( >>> else: >>> return None >>> - :param server_cert: method or property on the charm type that returns an - optional absolute path to a tls certificate to be used when sending traces to a remote server. - If it returns None, an _insecure_ connection will be used. - :param tracing_endpoint: name of a property on the charm type that returns an - optional (fully resolvable) tempo url. If None, tracing will be effectively disabled. 
Else, traces will be - pushed to that endpoint. + + :param tracing_endpoint: name of a method, property or attribute on the charm type that returns an + optional (fully resolvable) tempo url to which the charm traces will be pushed. + If None, tracing will be effectively disabled. + :param server_cert: name of a method, property or attribute on the charm type that returns an + optional absolute path to a CA certificate file to be used when sending traces to a remote server. + If it returns None, an _insecure_ connection will be used. To avoid errors in transient + situations where the endpoint is already https but there is no certificate on disk yet, it + is recommended to disable tracing (by returning None from the tracing_endpoint) altogether + until the cert has been written to disk. :param service_name: service name tag to attach to all traces generated by this charm. Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... """ - def _decorator(charm_type: Type[CharmBase]): + def _decorator(charm_type: _T) -> _T: """Autoinstrument the wrapped charmbase type.""" _autoinstrument( charm_type, - tracing_endpoint_getter=getattr(charm_type, tracing_endpoint), - server_cert_getter=getattr(charm_type, server_cert) if server_cert else None, + tracing_endpoint_attr=tracing_endpoint, + server_cert_attr=server_cert, service_name=service_name, extra_types=extra_types, ) @@ -445,12 +575,12 @@ def _decorator(charm_type: Type[CharmBase]): def _autoinstrument( - charm_type: Type[CharmBase], - tracing_endpoint_getter: _GetterType, - server_cert_getter: Optional[_GetterType] = None, + charm_type: _T, + tracing_endpoint_attr: str, + server_cert_attr: Optional[str] = None, service_name: Optional[str] = None, extra_types: Sequence[type] = (), -) -> Type[CharmBase]: +) -> _T: """Set up tracing on this charm class. Use this function to get out-of-the-box traces for all events emitted on this charm and all @@ -462,29 +592,32 @@ def _autoinstrument( >>> from ops.main import main >>> _autoinstrument( >>> MyCharm, - >>> tracing_endpoint_getter=MyCharm.tempo_otlp_http_endpoint, + >>> tracing_endpoint_attr="tempo_otlp_http_endpoint", >>> service_name="MyCharm", >>> extra_types=(Foo, Bar) >>> ) >>> main(MyCharm) :param charm_type: the CharmBase subclass to autoinstrument. - :param server_cert_getter: method or property on the charm type that returns an - optional absolute path to a tls certificate to be used when sending traces to a remote server. - This needs to be a valid path to a certificate. - :param tracing_endpoint_getter: method or property on the charm type that returns an - optional tempo url. If None, tracing will be effectively disabled. Else, traces will be - pushed to that endpoint. + :param tracing_endpoint_attr: name of a method, property or attribute on the charm type that returns an + optional (fully resolvable) tempo url to which the charm traces will be pushed. + If None, tracing will be effectively disabled. + :param server_cert_attr: name of a method, property or attribute on the charm type that returns an + optional absolute path to a CA certificate file to be used when sending traces to a remote server. + If it returns None, an _insecure_ connection will be used. 
To avoid errors in transient + situations where the endpoint is already https but there is no certificate on disk yet, it + is recommended to disable tracing (by returning None from the tracing_endpoint) altogether + until the cert has been written to disk. :param service_name: service name tag to attach to all traces generated by this charm. Defaults to the juju application name this charm is deployed under. :param extra_types: pass any number of types that you also wish to autoinstrument. For example, charm libs, relation endpoint wrappers, workload abstractions, ... """ - logger.info(f"instrumenting {charm_type}") + dev_logger.info(f"instrumenting {charm_type}") _setup_root_span_initializer( charm_type, - tracing_endpoint_getter, - server_cert_getter=server_cert_getter, + tracing_endpoint_attr, + server_cert_attr=server_cert_attr, service_name=service_name, ) trace_type(charm_type) @@ -501,50 +634,66 @@ def trace_type(cls: _T) -> _T: It assumes that this class is only instantiated after a charm type decorated with `@trace_charm` has been instantiated. """ - logger.info(f"instrumenting {cls}") + dev_logger.info(f"instrumenting {cls}") for name, method in inspect.getmembers(cls, predicate=inspect.isfunction): - logger.info(f"discovered {method}") + dev_logger.info(f"discovered {method}") if method.__name__.startswith("__"): - logger.info(f"skipping {method} (dunder)") + dev_logger.info(f"skipping {method} (dunder)") continue - isstatic = isinstance(inspect.getattr_static(cls, method.__name__), staticmethod) - setattr(cls, name, trace_method(method, static=isstatic)) + # the span title in the general case should be: + # method call: MyCharmWrappedMethods.b + # if the method has a name (functools.wrapped or regular method), let + # _trace_callable use its default algorithm to determine what name to give the span. + trace_method_name = None + try: + qualname_c0 = method.__qualname__.split(".")[0] + if not hasattr(cls, method.__name__): + # if the callable doesn't have a __name__ (probably a decorated method), + # it probably has a bad qualname too (such as my_decorator..wrapper) which is not + # great for finding out what the trace is about. So we use the method name instead and + # add a reference to the decorator name. Result: + # method call: @my_decorator(MyCharmWrappedMethods.b) + trace_method_name = f"@{qualname_c0}({cls.__name__}.{name})" + except Exception: # noqa: failsafe + pass + + new_method = trace_method(method, name=trace_method_name) + + if isinstance(inspect.getattr_static(cls, name), staticmethod): + new_method = staticmethod(new_method) + setattr(cls, name, new_method) return cls -def trace_method(method: _F, static: bool = False) -> _F: +def trace_method(method: _F, name: Optional[str] = None) -> _F: """Trace this method. A span will be opened when this method is called and closed when it returns. """ - return _trace_callable(method, "method", static=static) + return _trace_callable(method, "method", name=name) -def trace_function(function: _F) -> _F: +def trace_function(function: _F, name: Optional[str] = None) -> _F: """Trace this function. A span will be opened when this function is called and closed when it returns. 
""" - return _trace_callable(function, "function") + return _trace_callable(function, "function", name=name) -def _trace_callable(callable: _F, qualifier: str, static: bool = False) -> _F: - logger.info(f"instrumenting {callable}") +def _trace_callable(callable: _F, qualifier: str, name: Optional[str] = None) -> _F: + dev_logger.info(f"instrumenting {callable}") # sig = inspect.signature(callable) @functools.wraps(callable) def wrapped_function(*args, **kwargs): # type: ignore - name = getattr(callable, "__qualname__", getattr(callable, "__name__", str(callable))) - with _span(f"{'(static) ' if static else ''}{qualifier} call: {name}"): # type: ignore - if static: - # fixme: do we or don't we need [1:]? - # The _trace_callable decorator doesn't always play nice with @staticmethods. - # Sometimes it will receive 'self', sometimes it won't. - # return callable(*args, **kwargs) # type: ignore - return callable(*args[1:], **kwargs) # type: ignore + name_ = name or getattr( + callable, "__qualname__", getattr(callable, "__name__", str(callable)) + ) + with _span(f"{qualifier} call: {name_}"): # type: ignore return callable(*args, **kwargs) # type: ignore # wrapped_function.__signature__ = sig diff --git a/lib/charms/tempo_k8s/v2/tracing.py b/lib/charms/tempo_k8s/v2/tracing.py index 9e5defc..dfb2336 100644 --- a/lib/charms/tempo_k8s/v2/tracing.py +++ b/lib/charms/tempo_k8s/v2/tracing.py @@ -69,8 +69,10 @@ def __init__(self, *args): """ # noqa: W505 +import enum import json import logging +from pathlib import Path from typing import ( TYPE_CHECKING, Any, @@ -81,6 +83,7 @@ def __init__(self, *args): Optional, Sequence, Tuple, + Union, cast, ) @@ -94,7 +97,7 @@ def __init__(self, *args): ) from ops.framework import EventSource, Object from ops.model import ModelError, Relation -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict, Field # The unique Charmhub library identifier, never change it LIBID = "12977e9aa0b34367903d8afeb8c3d85d" @@ -104,7 +107,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 5 +LIBPATCH = 8 PYDEPS = ["pydantic"] @@ -113,24 +116,41 @@ def __init__(self, *args): DEFAULT_RELATION_NAME = "tracing" RELATION_INTERFACE_NAME = "tracing" +# Supported list rationale https://github.com/canonical/tempo-coordinator-k8s-operator/issues/8 ReceiverProtocol = Literal[ "zipkin", - "kafka", - "opencensus", - "tempo_http", - "tempo_grpc", "otlp_grpc", "otlp_http", - # "jaeger_grpc", - "jaeger_thrift_compact", + "jaeger_grpc", "jaeger_thrift_http", - "jaeger_thrift_binary", ] -RawReceiver = Tuple[ReceiverProtocol, int] +RawReceiver = Tuple[ReceiverProtocol, str] +"""Helper type. A raw receiver is defined as a tuple consisting of the protocol name, and the (external, if available), +(secured, if available) resolvable server url. +""" + BUILTIN_JUJU_KEYS = {"ingress-address", "private-address", "egress-subnets"} +class TransportProtocolType(str, enum.Enum): + """Receiver Type.""" + + http = "http" + grpc = "grpc" + + +receiver_protocol_to_transport_protocol: Dict[ReceiverProtocol, TransportProtocolType] = { + "zipkin": TransportProtocolType.http, + "otlp_grpc": TransportProtocolType.grpc, + "otlp_http": TransportProtocolType.http, + "jaeger_thrift_http": TransportProtocolType.http, + "jaeger_grpc": TransportProtocolType.grpc, +} +"""A mapping between telemetry protocols and their corresponding transport protocol. 
+""" + + class TracingError(Exception): """Base class for custom errors raised by this library.""" @@ -289,27 +309,81 @@ def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): # todo use models from charm-relation-interfaces -class Receiver(BaseModel): # noqa: D101 - """Receiver data structure.""" +if int(pydantic.version.VERSION.split(".")[0]) < 2: + + class ProtocolType(BaseModel): # type: ignore + """Protocol Type.""" - protocol: ReceiverProtocol - port: int + class Config: + """Pydantic config.""" + use_enum_values = True + """Allow serializing enum values.""" -class TracingProviderAppData(DatabagModel): # noqa: D101 - """Application databag model for the tracing provider.""" + name: str = Field( + ..., + description="Receiver protocol name. What protocols are supported (and what they are called) " + "may differ per provider.", + examples=["otlp_grpc", "otlp_http", "tempo_http"], + ) + + type: TransportProtocolType = Field( + ..., + description="The transport protocol used by this receiver.", + examples=["http", "grpc"], + ) + +else: + + class ProtocolType(BaseModel): + """Protocol Type.""" - host: str - """Server hostname (local fqdn).""" + model_config = ConfigDict( + # Allow serializing enum values. + use_enum_values=True + ) + """Pydantic config.""" - receivers: List[Receiver] - """Enabled receivers and ports at which they are listening.""" + name: str = Field( + ..., + description="Receiver protocol name. What protocols are supported (and what they are called) " + "may differ per provider.", + examples=["otlp_grpc", "otlp_http", "tempo_http"], + ) + + type: TransportProtocolType = Field( + ..., + description="The transport protocol used by this receiver.", + examples=["http", "grpc"], + ) - external_url: Optional[str] = None - """Server url. If an ingress is present, it will be the ingress address.""" - internal_scheme: Optional[str] = None - """Scheme for internal communication. If it is present, it will be protocol accepted by the provider.""" +class Receiver(BaseModel): + """Specification of an active receiver.""" + + protocol: ProtocolType = Field(..., description="Receiver protocol name and type.") + url: str = Field( + ..., + description="""URL at which the receiver is reachable. If there's an ingress, it would be the external URL. + Otherwise, it would be the service's fqdn or internal IP. 
+ If the protocol type is grpc, the url will not contain a scheme.""", + examples=[ + "http://traefik_address:2331", + "https://traefik_address:2331", + "http://tempo_public_ip:2331", + "https://tempo_public_ip:2331", + "tempo_public_ip:2331", + ], + ) + + +class TracingProviderAppData(DatabagModel): # noqa: D101 + """Application databag model for the tracing provider.""" + + receivers: List[Receiver] = Field( + ..., + description="List of all receivers enabled on the tracing provider.", + ) class TracingRequirerAppData(DatabagModel): # noqa: D101 @@ -481,10 +555,15 @@ def requested_receivers(self) -> List[ReceiverProtocol]: return TracingRequirerAppData.load(relation.data[app]).receivers +class BrokenEvent(RelationBrokenEvent): + """Event emitted when a relation on tracing is broken.""" + + class TracingEndpointProviderEvents(CharmEvents): """TracingEndpointProvider events.""" request = EventSource(RequestEvent) + broken = EventSource(BrokenEvent) class TracingEndpointProvider(Object): @@ -495,21 +574,17 @@ class TracingEndpointProvider(Object): def __init__( self, charm: CharmBase, - host: str, external_url: Optional[str] = None, relation_name: str = DEFAULT_RELATION_NAME, - internal_scheme: Optional[Literal["http", "https"]] = "http", ): """Initialize. Args: charm: a `CharmBase` instance that manages this instance of the Tempo service. - host: address of the node hosting the tempo server. external_url: external address of the node hosting the tempo server, if an ingress is present. relation_name: an optional string name of the relation between `charm` and the Tempo charmed service. The default is "tracing". - internal_scheme: scheme to use with internal urls. Raises: RelationNotFoundError: If there is no relation in the charm's metadata.yaml @@ -525,12 +600,10 @@ def __init__( charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides ) - super().__init__(charm, relation_name + "tracing-provider-v2") + super().__init__(charm, relation_name + "tracing-provider") self._charm = charm - self._host = host self._external_url = external_url self._relation_name = relation_name - self._internal_scheme = internal_scheme self.framework.observe( self._charm.on[relation_name].relation_joined, self._on_relation_event ) @@ -540,18 +613,21 @@ def __init__( self.framework.observe( self._charm.on[relation_name].relation_changed, self._on_relation_event ) + self.framework.observe( + self._charm.on[relation_name].relation_broken, self._on_relation_broken_event + ) + + def _on_relation_broken_event(self, e: RelationBrokenEvent): + """Handle relation broken events.""" + self.on.broken.emit(e.relation) def _on_relation_event(self, e: RelationEvent): """Handle relation created/joined/changed events.""" - if self.is_v2(e.relation): + if self.is_requirer_ready(e.relation): self.on.request.emit(e.relation) - def is_v2(self, relation: Relation): - """Attempt to determine if this relation is a tracing v2 relation. - - Assumes that the V2 requirer will, as soon as possible (relation-created), - publish the list of requested ingestion receivers (can be empty too). 
- """ + def is_requirer_ready(self, relation: Relation): + """Attempt to determine if requirer has already populated app data.""" try: self._get_requested_protocols(relation) except NotReadyError: @@ -567,7 +643,7 @@ def _get_requested_protocols(relation: Relation): try: databag = TracingRequirerAppData.load(relation.data[app]) except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError): - logger.info(f"relation {relation} is not ready to talk tracing v2") + logger.info(f"relation {relation} is not ready to talk tracing") raise NotReadyError() return databag.receivers @@ -584,8 +660,8 @@ def requested_protocols(self): @property def relations(self) -> List[Relation]: - """All v2 relations active on this endpoint.""" - return [r for r in self._charm.model.relations[self._relation_name] if self.is_v2(r)] + """All relations active on this endpoint.""" + return self._charm.model.relations[self._relation_name] def publish_receivers(self, receivers: Sequence[RawReceiver]): """Let all requirers know that these receivers are active and listening.""" @@ -595,12 +671,16 @@ def publish_receivers(self, receivers: Sequence[RawReceiver]): for relation in self.relations: try: TracingProviderAppData( - host=self._host, - external_url=self._external_url or None, receivers=[ - Receiver(port=port, protocol=protocol) for protocol, port in receivers + Receiver( + url=url, + protocol=ProtocolType( + name=protocol, + type=receiver_protocol_to_transport_protocol[protocol], + ), + ) + for protocol, url in receivers ], - internal_scheme=self._internal_scheme, ).dump(relation.data[self._charm.app]) except ModelError as e: @@ -625,11 +705,9 @@ class EndpointRemovedEvent(RelationBrokenEvent): class EndpointChangedEvent(_AutoSnapshotEvent): """Event representing a change in one of the receiver endpoints.""" - __args__ = ("host", "external_url", "_receivers") + __args__ = ("_receivers",) if TYPE_CHECKING: - host = "" # type: str - external_url = "" # type: str _receivers = [] # type: List[dict] @property @@ -769,12 +847,6 @@ def is_ready(self, relation: Optional[Relation] = None): return False try: databag = dict(relation.data[relation.app]) - # "ingesters" Might be populated if the provider sees a v1 relation before a v2 requirer has had time to - # publish the 'receivers' list. This will make Tempo incorrectly assume that this is a v1 - # relation, and act accordingly. Later, when the requirer publishes the requested receivers, - # tempo will be able to course-correct. 
- if "ingesters" in databag: - del databag["ingesters"] TracingProviderAppData.load(databag) except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError): @@ -790,9 +862,7 @@ def _on_tracing_relation_changed(self, event): return data = TracingProviderAppData.load(relation.data[relation.app]) - self.on.endpoint_changed.emit( # type: ignore - relation, data.host, data.external_url, [i.dict() for i in data.receivers] - ) + self.on.endpoint_changed.emit(relation, [i.dict() for i in data.receivers]) # type: ignore def _on_tracing_relation_broken(self, event: RelationBrokenEvent): """Notify the providers that the endpoint is broken.""" @@ -815,7 +885,7 @@ def _get_endpoint( if not app_data: return None receivers: List[Receiver] = list( - filter(lambda i: i.protocol == protocol, app_data.receivers) + filter(lambda i: i.protocol.name == protocol, app_data.receivers) ) if not receivers: logger.error(f"no receiver found with protocol={protocol!r}") @@ -827,18 +897,7 @@ def _get_endpoint( return receiver = receivers[0] - # if there's an external_url argument (v2.5+), use that. Otherwise, we use the tempo local fqdn - if app_data.external_url: - url = f"{app_data.external_url}:{receiver.port}" - else: - # if we didn't receive a scheme (old provider), we assume HTTP is used - url = f"{app_data.internal_scheme or 'http'}://{app_data.host}:{receiver.port}" - - if receiver.protocol.endswith("grpc"): - # TCP protocols don't want an http/https scheme prefix - url = url.split("://")[1] - - return url + return receiver.url def get_endpoint( self, protocol: ReceiverProtocol, relation: Optional[Relation] = None @@ -862,19 +921,67 @@ def get_endpoint( return None return endpoint - # for backwards compatibility with earlier revisions: - def otlp_grpc_endpoint(self): - """Use TracingEndpointRequirer.get_endpoint('otlp_grpc') instead.""" - logger.warning( - "`TracingEndpointRequirer.otlp_grpc_endpoint` is deprecated. " - "Use `TracingEndpointRequirer.get_endpoint('otlp_grpc') instead.`" - ) - return self.get_endpoint("otlp_grpc") - def otlp_http_endpoint(self): - """Use TracingEndpointRequirer.get_endpoint('otlp_http') instead.""" - logger.warning( - "`TracingEndpointRequirer.otlp_http_endpoint` is deprecated. " - "Use `TracingEndpointRequirer.get_endpoint('otlp_http') instead.`" - ) - return self.get_endpoint("otlp_http") +def charm_tracing_config( + endpoint_requirer: TracingEndpointRequirer, cert_path: Optional[Union[Path, str]] +) -> Tuple[Optional[str], Optional[str]]: + """Utility function to determine the charm_tracing config you will likely want. + + If no endpoint is provided: + disable charm tracing. + If https endpoint is provided but cert_path is not found on disk: + disable charm tracing. + If https endpoint is provided and cert_path is None: + ERROR + Else: + proceed with charm tracing (with or without tls, as appropriate) + + Usage: + If you are using charm_tracing >= v1.9: + >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm + >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config + >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") + >>> class MyCharm(...): + >>> _cert_path = "/path/to/cert/on/charm/container.crt" + >>> def __init__(self, ...): + >>> self.tracing = TracingEndpointRequirer(...) + >>> self.my_endpoint, self.cert_path = charm_tracing_config( + ... 
self.tracing, self._cert_path) + + If you are using charm_tracing < v1.9: + >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm + >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config + >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") + >>> class MyCharm(...): + >>> _cert_path = "/path/to/cert/on/charm/container.crt" + >>> def __init__(self, ...): + >>> self.tracing = TracingEndpointRequirer(...) + >>> self._my_endpoint, self._cert_path = charm_tracing_config( + ... self.tracing, self._cert_path) + >>> @property + >>> def my_endpoint(self): + >>> return self._my_endpoint + >>> @property + >>> def cert_path(self): + >>> return self._cert_path + + """ + if not endpoint_requirer.is_ready(): + return None, None + + endpoint = endpoint_requirer.get_endpoint("otlp_http") + if not endpoint: + return None, None + + is_https = endpoint.startswith("https://") + + if is_https: + if cert_path is None: + raise TracingError("Cannot send traces to an https endpoint without a certificate.") + elif not Path(cert_path).exists(): + # if endpoint is https BUT we don't have a server_cert yet: + # disable charm tracing until we do to prevent tls errors + return None, None + return endpoint, str(cert_path) + else: + return endpoint, None diff --git a/lib/charms/tls_certificates_interface/v3/tls_certificates.py b/lib/charms/tls_certificates_interface/v3/tls_certificates.py index cbdd80d..aa4704c 100644 --- a/lib/charms/tls_certificates_interface/v3/tls_certificates.py +++ b/lib/charms/tls_certificates_interface/v3/tls_certificates.py @@ -111,6 +111,7 @@ def _on_certificate_request(self, event: CertificateCreationRequestEvent) -> Non ca=ca_certificate, chain=[ca_certificate, certificate], relation_id=event.relation_id, + recommended_expiry_notification_time=720, ) def _on_certificate_revocation_request(self, event: CertificateRevocationRequestEvent) -> None: @@ -276,13 +277,13 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven """ # noqa: D405, D410, D411, D214, D416 import copy +import ipaddress import json import logging import uuid from contextlib import suppress from dataclasses import dataclass from datetime import datetime, timedelta, timezone -from ipaddress import IPv4Address from typing import List, Literal, Optional, Union from cryptography import x509 @@ -316,7 +317,7 @@ def _on_all_certificates_invalidated(self, event: AllCertificatesInvalidatedEven # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 10 +LIBPATCH = 17 PYDEPS = ["cryptography", "jsonschema"] @@ -453,11 +454,35 @@ class ProviderCertificate: ca: str chain: List[str] revoked: bool + expiry_time: datetime + expiry_notification_time: Optional[datetime] = None def chain_as_pem(self) -> str: """Return full certificate chain as a PEM string.""" return "\n\n".join(reversed(self.chain)) + def to_json(self) -> str: + """Return the object as a JSON string. 
+ + Returns: + str: JSON representation of the object + """ + return json.dumps( + { + "relation_id": self.relation_id, + "application_name": self.application_name, + "csr": self.csr, + "certificate": self.certificate, + "ca": self.ca, + "chain": self.chain, + "revoked": self.revoked, + "expiry_time": self.expiry_time.isoformat(), + "expiry_notification_time": self.expiry_notification_time.isoformat() + if self.expiry_notification_time + else None, + } + ) + class CertificateAvailableEvent(EventBase): """Charm Event triggered when a TLS certificate is available.""" @@ -682,21 +707,49 @@ def _get_closest_future_time( ) -def _get_certificate_expiry_time(certificate: str) -> Optional[datetime]: - """Extract expiry time from a certificate string. +def calculate_expiry_notification_time( + validity_start_time: datetime, + expiry_time: datetime, + provider_recommended_notification_time: Optional[int], + requirer_recommended_notification_time: Optional[int], +) -> datetime: + """Calculate a reasonable time to notify the user about the certificate expiry. + + It takes into account the time recommended by the provider and by the requirer. + Time recommended by the provider is preferred, + then time recommended by the requirer, + then dynamically calculated time. Args: - certificate (str): x509 certificate as a string + validity_start_time: Certificate validity time + expiry_time: Certificate expiry time + provider_recommended_notification_time: + Time in hours prior to expiry to notify the user. + Recommended by the provider. + requirer_recommended_notification_time: + Time in hours prior to expiry to notify the user. + Recommended by the requirer. Returns: - Optional[datetime]: Expiry datetime or None + datetime: Time to notify the user about the certificate expiry. """ - try: - certificate_object = x509.load_pem_x509_certificate(data=certificate.encode()) - return certificate_object.not_valid_after_utc - except ValueError: - logger.warning("Could not load certificate.") - return None + if provider_recommended_notification_time is not None: + provider_recommended_notification_time = abs(provider_recommended_notification_time) + provider_recommendation_time_delta = ( + expiry_time - timedelta(hours=provider_recommended_notification_time) + ) + if validity_start_time < provider_recommendation_time_delta: + return provider_recommendation_time_delta + + if requirer_recommended_notification_time is not None: + requirer_recommended_notification_time = abs(requirer_recommended_notification_time) + requirer_recommendation_time_delta = ( + expiry_time - timedelta(hours=requirer_recommended_notification_time) + ) + if validity_start_time < requirer_recommendation_time_delta: + return requirer_recommendation_time_delta + calculated_hours = (expiry_time - validity_start_time).total_seconds() / (3600 * 3) + return expiry_time - timedelta(hours=calculated_hours) def generate_ca( @@ -965,6 +1018,8 @@ def generate_csr( # noqa: C901 organization: Optional[str] = None, email_address: Optional[str] = None, country_name: Optional[str] = None, + state_or_province_name: Optional[str] = None, + locality_name: Optional[str] = None, private_key_password: Optional[bytes] = None, sans: Optional[List[str]] = None, sans_oid: Optional[List[str]] = None, @@ -983,6 +1038,8 @@ def generate_csr( # noqa: C901 organization (str): Name of organization. email_address (str): Email address. country_name (str): Country Name. + state_or_province_name (str): State or Province Name. + locality_name (str): Locality Name. 
private_key_password (bytes): Private key password sans (list): Use sans_dns - this will be deprecated in a future release List of DNS subject alternative names (keeping it for now for backward compatibility) @@ -1008,13 +1065,19 @@ def generate_csr( # noqa: C901 subject_name.append(x509.NameAttribute(x509.NameOID.EMAIL_ADDRESS, email_address)) if country_name: subject_name.append(x509.NameAttribute(x509.NameOID.COUNTRY_NAME, country_name)) + if state_or_province_name: + subject_name.append( + x509.NameAttribute(x509.NameOID.STATE_OR_PROVINCE_NAME, state_or_province_name) + ) + if locality_name: + subject_name.append(x509.NameAttribute(x509.NameOID.LOCALITY_NAME, locality_name)) csr = x509.CertificateSigningRequestBuilder(subject_name=x509.Name(subject_name)) _sans: List[x509.GeneralName] = [] if sans_oid: _sans.extend([x509.RegisteredID(x509.ObjectIdentifier(san)) for san in sans_oid]) if sans_ip: - _sans.extend([x509.IPAddress(IPv4Address(san)) for san in sans_ip]) + _sans.extend([x509.IPAddress(ipaddress.ip_address(san)) for san in sans_ip]) if sans: _sans.extend([x509.DNSName(san) for san in sans]) if sans_dns: @@ -1030,6 +1093,13 @@ def generate_csr( # noqa: C901 return signed_certificate.public_bytes(serialization.Encoding.PEM) +def get_sha256_hex(data: str) -> str: + """Calculate the hash of the provided data and return the hexadecimal representation.""" + digest = hashes.Hash(hashes.SHA256()) + digest.update(data.encode()) + return digest.finalize().hex() + + def csr_matches_certificate(csr: str, cert: str) -> bool: """Check if a CSR matches a certificate. @@ -1039,25 +1109,16 @@ def csr_matches_certificate(csr: str, cert: str) -> bool: Returns: bool: True/False depending on whether the CSR matches the certificate. """ - try: - csr_object = x509.load_pem_x509_csr(csr.encode("utf-8")) - cert_object = x509.load_pem_x509_certificate(cert.encode("utf-8")) - - if csr_object.public_key().public_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PublicFormat.SubjectPublicKeyInfo, - ) != cert_object.public_key().public_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PublicFormat.SubjectPublicKeyInfo, - ): - return False - if ( - csr_object.public_key().public_numbers().n # type: ignore[union-attr] - != cert_object.public_key().public_numbers().n # type: ignore[union-attr] - ): - return False - except ValueError: - logger.warning("Could not load certificate or CSR.") + csr_object = x509.load_pem_x509_csr(csr.encode("utf-8")) + cert_object = x509.load_pem_x509_certificate(cert.encode("utf-8")) + + if csr_object.public_key().public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo, + ) != cert_object.public_key().public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo, + ): return False return True @@ -1135,6 +1196,7 @@ def _add_certificate( certificate_signing_request: str, ca: str, chain: List[str], + recommended_expiry_notification_time: Optional[int] = None, ) -> None: """Add certificate to relation data. @@ -1144,6 +1206,8 @@ def _add_certificate( certificate_signing_request (str): Certificate Signing Request ca (str): CA Certificate chain (list): CA Chain + recommended_expiry_notification_time (int): + Time in hours before the certificate expires to notify the user. 
Returns: None @@ -1161,6 +1225,7 @@ def _add_certificate( "certificate_signing_request": certificate_signing_request, "ca": ca, "chain": chain, + "recommended_expiry_notification_time": recommended_expiry_notification_time, } provider_relation_data = self._load_app_relation_data(relation) provider_certificates = provider_relation_data.get("certificates", []) @@ -1227,6 +1292,7 @@ def set_relation_certificate( ca: str, chain: List[str], relation_id: int, + recommended_expiry_notification_time: Optional[int] = None, ) -> None: """Add certificates to relation data. @@ -1236,6 +1302,8 @@ def set_relation_certificate( ca (str): CA Certificate chain (list): CA Chain relation_id (int): Juju relation ID + recommended_expiry_notification_time (int): + Recommended time in hours before the certificate expires to notify the user. Returns: None @@ -1257,6 +1325,7 @@ def set_relation_certificate( certificate_signing_request=certificate_signing_request.strip(), ca=ca.strip(), chain=[cert.strip() for cert in chain], + recommended_expiry_notification_time=recommended_expiry_notification_time, ) def remove_certificate(self, certificate: str) -> None: @@ -1310,6 +1379,13 @@ def get_provider_certificates( provider_relation_data = self._load_app_relation_data(relation) provider_certificates = provider_relation_data.get("certificates", []) for certificate in provider_certificates: + try: + certificate_object = x509.load_pem_x509_certificate( + data=certificate["certificate"].encode() + ) + except ValueError as e: + logger.error("Could not load certificate - Skipping: %s", e) + continue provider_certificate = ProviderCertificate( relation_id=relation.id, application_name=relation.app.name, @@ -1318,6 +1394,10 @@ def get_provider_certificates( ca=certificate["ca"], chain=certificate["chain"], revoked=certificate.get("revoked", False), + expiry_time=certificate_object.not_valid_after_utc, + expiry_notification_time=certificate.get( + "recommended_expiry_notification_time" + ), ) certificates.append(provider_certificate) return certificates @@ -1475,15 +1555,17 @@ def __init__( self, charm: CharmBase, relationship_name: str, - expiry_notification_time: int = 168, + expiry_notification_time: Optional[int] = None, ): """Generate/use private key and observes relation changed event. Args: charm: Charm object relationship_name: Juju relation name - expiry_notification_time (int): Time difference between now and expiry (in hours). - Used to trigger the CertificateExpiring event. Default: 7 days. + expiry_notification_time (int): Number of hours prior to certificate expiry. + Used to trigger the CertificateExpiring event. + This value is used as a recommendation only, + The actual value is calculated taking into account the provider's recommendation. 
""" super().__init__(charm, relationship_name) if not JujuVersion.from_environ().has_secrets: @@ -1544,9 +1626,25 @@ def get_provider_certificates(self) -> List[ProviderCertificate]: if not certificate: logger.warning("No certificate found in relation data - Skipping") continue + try: + certificate_object = x509.load_pem_x509_certificate(data=certificate.encode()) + except ValueError as e: + logger.error("Could not load certificate - Skipping: %s", e) + continue ca = provider_certificate_dict.get("ca") chain = provider_certificate_dict.get("chain", []) csr = provider_certificate_dict.get("certificate_signing_request") + recommended_expiry_notification_time = provider_certificate_dict.get( + "recommended_expiry_notification_time" + ) + expiry_time = certificate_object.not_valid_after_utc + validity_start_time = certificate_object.not_valid_before_utc + expiry_notification_time = calculate_expiry_notification_time( + validity_start_time=validity_start_time, + expiry_time=expiry_time, + provider_recommended_notification_time=recommended_expiry_notification_time, + requirer_recommended_notification_time=self.expiry_notification_time, + ) if not csr: logger.warning("No CSR found in relation data - Skipping") continue @@ -1559,6 +1657,8 @@ def get_provider_certificates(self) -> List[ProviderCertificate]: ca=ca, chain=chain, revoked=revoked, + expiry_time=expiry_time, + expiry_notification_time=expiry_notification_time, ) provider_certificates.append(provider_certificate) return provider_certificates @@ -1708,13 +1808,9 @@ def get_expiring_certificates(self) -> List[ProviderCertificate]: expiring_certificates: List[ProviderCertificate] = [] for requirer_csr in self.get_certificate_signing_requests(fulfilled_only=True): if cert := self._find_certificate_in_relation_data(requirer_csr.csr): - expiry_time = _get_certificate_expiry_time(cert.certificate) - if not expiry_time: + if not cert.expiry_time or not cert.expiry_notification_time: continue - expiry_notification_time = expiry_time - timedelta( - hours=self.expiry_notification_time - ) - if datetime.now(timezone.utc) > expiry_notification_time: + if datetime.now(timezone.utc) > cert.expiry_notification_time: expiring_certificates.append(cert) return expiring_certificates @@ -1774,9 +1870,15 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: ] for certificate in provider_certificates: if certificate.csr in requirer_csrs: + csr_in_sha256_hex = get_sha256_hex(certificate.csr) if certificate.revoked: with suppress(SecretNotFoundError): - secret = self.model.get_secret(label=f"{LIBID}-{certificate.csr}") + logger.debug( + "Removing secret with label %s", + f"{LIBID}-{csr_in_sha256_hex}", + ) + secret = self.model.get_secret( + label=f"{LIBID}-{csr_in_sha256_hex}") secret.remove_all_revisions() self.on.certificate_invalidated.emit( reason="revoked", @@ -1787,16 +1889,24 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: ) else: try: - secret = self.model.get_secret(label=f"{LIBID}-{certificate.csr}") - secret.set_content({"certificate": certificate.certificate}) + logger.debug( + "Setting secret with label %s", f"{LIBID}-{csr_in_sha256_hex}" + ) + secret = self.model.get_secret(label=f"{LIBID}-{csr_in_sha256_hex}") + secret.set_content( + {"certificate": certificate.certificate, "csr": certificate.csr} + ) secret.set_info( - expire=self._get_next_secret_expiry_time(certificate.certificate), + expire=self._get_next_secret_expiry_time(certificate), ) except SecretNotFoundError: + logger.debug( + "Creating new 
secret with label %s", f"{LIBID}-{csr_in_sha256_hex}" + ) secret = self.charm.unit.add_secret( - {"certificate": certificate.certificate}, - label=f"{LIBID}-{certificate.csr}", - expire=self._get_next_secret_expiry_time(certificate.certificate), + {"certificate": certificate.certificate, "csr": certificate.csr}, + label=f"{LIBID}-{csr_in_sha256_hex}", + expire=self._get_next_secret_expiry_time(certificate), ) self.on.certificate_available.emit( certificate_signing_request=certificate.csr, @@ -1805,7 +1915,7 @@ def _on_relation_changed(self, event: RelationChangedEvent) -> None: chain=certificate.chain, ) - def _get_next_secret_expiry_time(self, certificate: str) -> Optional[datetime]: + def _get_next_secret_expiry_time(self, certificate: ProviderCertificate) -> Optional[datetime]: """Return the expiry time or expiry notification time. Extracts the expiry time from the provided certificate, calculates the @@ -1813,17 +1923,18 @@ def _get_next_secret_expiry_time(self, certificate: str) -> Optional[datetime]: the future. Args: - certificate: x509 certificate + certificate: ProviderCertificate object Returns: Optional[datetime]: None if the certificate expiry time cannot be read, next expiry time otherwise. """ - expiry_time = _get_certificate_expiry_time(certificate) - if not expiry_time: + if not certificate.expiry_time or not certificate.expiry_notification_time: return None - expiry_notification_time = expiry_time - timedelta(hours=self.expiry_notification_time) - return _get_closest_future_time(expiry_notification_time, expiry_time) + return _get_closest_future_time( + certificate.expiry_notification_time, + certificate.expiry_time, + ) def _on_relation_broken(self, event: RelationBrokenEvent) -> None: """Handle Relation Broken Event. @@ -1857,27 +1968,26 @@ def _on_secret_expired(self, event: SecretExpiredEvent) -> None: """ if not event.secret.label or not event.secret.label.startswith(f"{LIBID}-"): return - csr = event.secret.label[len(f"{LIBID}-") :] + csr = event.secret.get_content()["csr"] provider_certificate = self._find_certificate_in_relation_data(csr) if not provider_certificate: # A secret expired but we did not find matching certificate. Cleaning up event.secret.remove_all_revisions() return - expiry_time = _get_certificate_expiry_time(provider_certificate.certificate) - if not expiry_time: + if not provider_certificate.expiry_time: # A secret expired but matching certificate is invalid. 
Cleaning up event.secret.remove_all_revisions() return - if datetime.now(timezone.utc) < expiry_time: + if datetime.now(timezone.utc) < provider_certificate.expiry_time: logger.warning("Certificate almost expired") self.on.certificate_expiring.emit( certificate=provider_certificate.certificate, - expiry=expiry_time.isoformat(), + expiry=provider_certificate.expiry_time.isoformat(), ) event.secret.set_info( - expire=_get_certificate_expiry_time(provider_certificate.certificate), + expire=provider_certificate.expiry_time, ) else: logger.warning("Certificate is expired") diff --git a/metadata.yaml b/metadata.yaml index 055421a..8b88c19 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -64,6 +64,8 @@ requires: limit: 1 provides: + tracing-provider: + interface: tracing logging-provider: interface: loki_push_api grafana-dashboards-provider: diff --git a/requirements.txt b/requirements.txt index 34cc3b0..c61b5bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,11 +4,12 @@ # FIXME: Packing the charm with 2.2.0+139.gd011d92 will not include dependencies in PYDEPS key: # https://chat.charmhub.io/charmhub/pl/wngp665ycjnb78ar9ojrfhxjkr # That's why we are including cosl here until the bug in charmcraft is solved -cosl -ops > 2.5.0 +cosl >= 0.0.19 +ops >= 2.5.0 pydantic < 2 requests kubernetes +cryptography lightkube lightkube-models cryptography diff --git a/src/charm.py b/src/charm.py index c016c3e..1d86171 100755 --- a/src/charm.py +++ b/src/charm.py @@ -7,13 +7,12 @@ import json import logging import pathlib -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Union import yaml from charms.loki_k8s.v1.loki_push_api import LokiPushApiProvider from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer from charms.tempo_k8s.v1.charm_tracing import trace_charm -from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer from cosl import GrafanaDashboard from grafana_agent import CONFIG_PATH, GrafanaAgentCharm from ops.main import main @@ -25,8 +24,9 @@ @trace_charm( - tracing_endpoint="tracing_endpoint", - server_cert="server_ca_cert_path", + # implemented in GrafanaAgentCharm + tracing_endpoint="_tracing_endpoint", + server_cert="_server_ca_cert_path", extra_types=( GrafanaAgentCharm, LokiPushApiProvider, @@ -46,6 +46,10 @@ class GrafanaAgentK8sCharm(GrafanaAgentCharm): {"logging-consumer"}, # or {"grafana-cloud-config"}, ], + "tracing-provider": [ # must be paired with: + {"tracing"}, # or + {"grafana-cloud-config"}, + ], "grafana-dashboards-consumer": [ # must be paired with: {"grafana-dashboards-provider"}, # or {"grafana-cloud-config"}, @@ -58,21 +62,17 @@ def __init__(self, *args): self.unit.set_ports(self._http_listen_port, self._grpc_listen_port) self._scrape = MetricsEndpointConsumer(self) - self.framework.observe( - self._scrape.on.targets_changed, # pyright: ignore - self.on_scrape_targets_changed, - ) - self._loki_provider = LokiPushApiProvider( self, relation_name="logging-provider", port=self._http_listen_port ) - self.framework.observe( self._loki_provider.on.loki_push_api_alert_rules_changed, # pyright: ignore self._on_loki_push_api_alert_rules_changed, ) - - self._tracing = TracingEndpointRequirer(self, protocols=["otlp_http"]) + self.framework.observe( + self._scrape.on.targets_changed, # pyright: ignore + self.on_scrape_targets_changed, + ) self.framework.observe( self.on["grafana-dashboards-consumer"].relation_changed, self._on_dashboards_changed, @@ -81,7 +81,6 @@ def __init__(self, *args): 
self.on["grafana-dashboards-consumer"].relation_broken, self._on_dashboards_changed, ) - self.framework.observe( self.on.agent_pebble_ready, # pyright: ignore self._on_agent_pebble_ready, @@ -168,11 +167,6 @@ def logs_rules(self) -> Dict[str, Any]: """Return a list of logging rules.""" return self._loki_provider.alerts - @property - def is_k8s(self) -> bool: - """Is this a k8s charm.""" - return True - @property def is_ready(self): """Checks if the charm is ready for configuration.""" @@ -244,18 +238,6 @@ def run(self, cmd: List[str]): """ self._container.exec(cmd).wait() - @property - def tracing_endpoint(self) -> Optional[str]: - """Otlp http endpoint for charm instrumentation.""" - if self._tracing.is_ready(): - return self._tracing.get_endpoint("otlp_http") - return None - - @property - def server_ca_cert_path(self) -> Optional[str]: - """Server CA certificate path for tls tracing.""" - return self._ca_path if self.cert.enabled else None - if __name__ == "__main__": main(GrafanaAgentK8sCharm) diff --git a/src/grafana_agent.py b/src/grafana_agent.py index ecdf168..6403866 100644 --- a/src/grafana_agent.py +++ b/src/grafana_agent.py @@ -7,11 +7,12 @@ import os import re import shutil +import socket import subprocess from collections import namedtuple from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Union, get_args import yaml from charms.certificate_transfer_interface.v0.certificate_transfer import ( @@ -32,6 +33,14 @@ from charms.prometheus_k8s.v1.prometheus_remote_write import ( PrometheusRemoteWriteConsumer, ) +from charms.tempo_k8s.v2.tracing import ( + ReceiverProtocol, + TracingEndpointProvider, + TracingEndpointRequirer, + TransportProtocolType, + charm_tracing_config, + receiver_protocol_to_transport_protocol, +) from cosl import MandatoryRelationPairs from ops.charm import CharmBase from ops.model import ActiveStatus, BlockedStatus, WaitingStatus @@ -78,12 +87,27 @@ class GrafanaAgentCharm(CharmBase): _name = "agent" _http_listen_port = 3500 _grpc_listen_port = 3600 - # TODO Change to a more suitable location once the snap gets access (#216). + _cert_path = "/tmp/agent/grafana-agent.pem" _key_path = "/tmp/agent/grafana-agent.key" _ca_path = "/usr/local/share/ca-certificates/grafana-agent-operator.crt" _ca_folder_path = "/usr/local/share/ca-certificates" + # mapping from tempo-supported receivers to the receiver ports to be opened on the grafana-agent host + _tracing_receivers_ports: Dict[ReceiverProtocol, int] = { + # OTLP receiver: see + # https://github.com/open-telemetry/opentelemetry-collector/tree/v0.96.0/receiver/otlpreceiver + "otlp_http": 4318, + "otlp_grpc": 4317, + # Jaeger receiver: see + # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/v0.96.0/receiver/jaegerreceiver + "jaeger_grpc": 14250, + "jaeger_thrift_http": 14268, + # Zipkin receiver: see + # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/v0.96.0/receiver/zipkinreceiver + "zipkin": 9411, + } + # Pairs of (incoming, [outgoing]) relation names. If any 'incoming' is joined without at least # one matching 'outgoing', the charm will block. Without any matching outgoing relation we may # incur data loss. 
@@ -145,10 +169,43 @@ def __init__(self, *args): key="grafana-agent-cert", ) - self.framework.observe(self.cert.on.cert_changed, self._on_cert_changed) # pyright: ignore + self._tracing = TracingEndpointRequirer( + self, + protocols=[ + "otlp_http", # for charm traces + "otlp_grpc", # for forwarding workload traces + ], + ) + self._tracing_provider = TracingEndpointProvider( + self, + # TODO: do we have an external url via ingress? + relation_name="tracing-provider", + ) + + self._tracing_endpoint, self._server_ca_cert_path = charm_tracing_config( + self._tracing, self._ca_path + ) self._cloud = GrafanaCloudConfigRequirer(self) + self.framework.observe( + self._tracing.on.endpoint_changed, # pyright: ignore + self._on_tracing_endpoint_changed, + ) + self.framework.observe( + self._tracing.on.endpoint_removed, # pyright: ignore + self._on_tracing_endpoint_removed, + ) + self.framework.observe( + self._tracing_provider.on.request, # pyright: ignore + self._on_tracing_provider_request, + ) + self.framework.observe( + self._tracing_provider.on.broken, # pyright: ignore + self._on_tracing_provider_broken, + ) + self.framework.observe(self.cert.on.cert_changed, self._on_cert_changed) # pyright: ignore + self.framework.observe( self._cloud.on.cloud_config_available, # pyright: ignore self._on_cloud_config_available, @@ -198,16 +255,73 @@ def __init__(self, *args): self.framework.observe(self.on[outgoing].relation_joined, self._update_status) self.framework.observe(self.on[outgoing].relation_broken, self._update_status) + def _get_tracing_receiver_url(self, protocol: ReceiverProtocol): + scheme = "http" + try: + if self._charm.cert.available: # type: ignore + scheme = "https" + except AttributeError: + pass + + # assume we're doing this in-model, since this charm doesn't have ingress + if receiver_protocol_to_transport_protocol[protocol] == TransportProtocolType.grpc: + return f"{socket.getfqdn()}:{self._tracing_receivers_ports[protocol]}" + return f"{scheme}://{socket.getfqdn()}:{self._tracing_receivers_ports[protocol]}" + + @property + def _force_enabled_tracing_protocols(self) -> Set[ReceiverProtocol]: + """Return a list of tracing receivers that have been force-enabled (by config).""" + return { + receiver + for receiver in get_args(ReceiverProtocol) + if self.config.get(f"always_enable_{receiver}") + } + + @property + def _requested_tracing_protocols(self) -> Set[ReceiverProtocol]: + """All receiver protocols that have been requested by our related apps.""" + return set(self._tracing_provider.requested_protocols()).union( + self._force_enabled_tracing_protocols + ) + + def _update_tracing_provider(self): + self._tracing_provider.publish_receivers( + tuple( + (protocol, self._get_tracing_receiver_url(protocol)) + for protocol in self._requested_tracing_protocols + ) + ) + def _on_cert_changed(self, _event): """Event handler for cert change.""" self._update_config() self._update_ca() self._update_status() + self._update_tracing_provider() + + def _on_tracing_endpoint_changed(self, _event) -> None: + """Event handler for the tracing endpoint-changed event.""" + self._update_config() + self._update_status() + self._update_tracing_provider() - def _on_mandatory_relation_event(self, _event=None): - """Event handler for any mandatory relation event.""" + def _on_tracing_endpoint_removed(self, _event) -> None: + """Event handler for the tracing endpoint-removed event.""" self._update_config() self._update_status() + self._update_tracing_provider() + + def _on_tracing_provider_request(self, _event) -> 
None: + """Event handler for the tracing-provider request event.""" + self._update_config() + self._update_status() + self._update_tracing_provider() + + def _on_tracing_provider_broken(self, _event) -> None: + """Event handler for the tracing-provider broken event.""" + self._update_config() + self._update_status() + self._update_tracing_provider() def _on_upgrade_charm(self, _event=None): """Refresh alerts if the charm is updated.""" @@ -215,6 +329,7 @@ def _on_upgrade_charm(self, _event=None): self._update_loki_alerts() self._update_config() self._update_status() + self._update_tracing_provider() def _on_loki_push_api_endpoint_joined(self, _event=None): """Rebuild the config with correct Loki sinks.""" @@ -230,14 +345,17 @@ def _on_config_changed(self, _event=None): """Rebuild the config.""" self._update_config() self._update_status() + self._update_tracing_provider() def _on_cloud_config_available(self, _) -> None: logger.info("cloud config available") self._update_config() + self._update_tracing_provider() def _on_cloud_config_revoked(self, _) -> None: logger.info("cloud config revoked") self._update_config() + self._update_tracing_provider() def _on_cert_transfer_available(self, event: CertificateTransferAvailableEvent): cert_filename = ( @@ -254,11 +372,6 @@ def _on_cert_transfer_removed(self, event: CertificateTransferRemovedEvent): self.run(["update-ca-certificates", "--fresh"]) # Abstract Methods - @property - def is_k8s(self) -> bool: - """Is this a k8s charm.""" - raise NotImplementedError("Please override the is_k8s method") - def agent_version_output(self) -> str: """Gets the raw output from `agent -version`.""" raise NotImplementedError("Please override the agent_version_output method") @@ -347,7 +460,7 @@ def _update_metrics_alerts(self): alerts_func=self.metrics_rules, reload_func=self._remote_write.reload_alerts, mapping=self.metrics_rules_paths, - copy_files=self.is_k8s, # TODO: This is ugly + copy_files=True, ) def _update_loki_alerts(self): @@ -431,7 +544,7 @@ def _update_status(self, *_): """Determine the charm status based on relation health and grafana-agent service readiness. This is a centralized status setter. Status should only be calculated here, or, if you need - to temporarily change the status (e.g. during snap install), always call this method after + to temporarily change the status (e.g. during install), always call this method after so the status is re-calculated (exceptions: on_install, on_remove). TODO: Rework this when "compound status" is implemented https://github.com/canonical/operator/issues/665 @@ -472,10 +585,12 @@ def _update_status(self, *_): # to inform via the Active message that they are in fact missing ("soft" warning). 
cos_rels = { "send-remote-write", + "tracing", "logging-consumer", "grafana-dashboards-provider", } - missing_rels = ( + # sorting is so that the order doesn't keep flapping on each hook depending on + missing_rels = sorted( cos_rels.difference(active_relations) if cos_rels.intersection(active_relations) else set() @@ -504,26 +619,9 @@ def _update_config(self) -> None: subprocess.run(["update-ca-certificates", "--fresh"], check=True) else: # Delete TLS related files if they exist - try: - self.read_file(self._cert_path) - except (FileNotFoundError, PathError): - pass - else: - self.delete_file(self._cert_path) - - try: - self.read_file(self._key_path) - except (FileNotFoundError, PathError): - pass - else: - self.delete_file(self._key_path) - - try: - self.read_file(self._ca_path) - except (FileNotFoundError, PathError): - pass - else: - self.delete_file(self._ca_path) + self._delete_file_if_exists(self._cert_path) + self._delete_file_if_exists(self._key_path) + self._delete_file_if_exists(self._ca_path) # charm container CA cert Path(self._ca_path).unlink(missing_ok=True) @@ -555,6 +653,14 @@ def _update_config(self) -> None: self.status.update_config = None + def _delete_file_if_exists(self, file_path): + try: + self.read_file(file_path) + except (FileNotFoundError, PathError): + pass + else: + self.delete_file(file_path) + def _on_dashboard_status_changed(self, _event=None): """Re-initialize dashboards to forward.""" # TODO: add constructor arg for `inject_dropdowns=False` instead of 'private' method? @@ -563,12 +669,23 @@ def _on_dashboard_status_changed(self, _event=None): ) # noqa self._update_status() - def _enrich_endpoints(self) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - """Add TLS information to Prometheus and Loki endpoints.""" + def _enhance_endpoints_with_tls(self, endpoints) -> List[Dict[str, Any]]: + for endpoint in endpoints: + endpoint["tls_config"] = { + "insecure_skip_verify": self.model.config.get("tls_insecure_skip_verify") + } + return endpoints + + def _prometheus_endpoints_with_tls(self) -> List[Dict[str, Any]]: + """Add TLS information to Prometheus endpoints. + + Also, injects the grafana-cloud-integrator endpoints into those we get from juju relations. + FIXME: these should be separate concerns. + """ prometheus_endpoints: List[Dict[str, Any]] = self._remote_write.endpoints if self._cloud.prometheus_ready: - prometheus_endpoint = {"url": self._cloud.prometheus_url} + prometheus_endpoint: Dict[str, Any] = {"url": self._cloud.prometheus_url} if self._cloud.credentials: prometheus_endpoint["basic_auth"] = { "username": self._cloud.credentials.username, @@ -576,6 +693,14 @@ def _enrich_endpoints(self) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]] } prometheus_endpoints.append(prometheus_endpoint) + return self._enhance_endpoints_with_tls(prometheus_endpoints) + + def _loki_endpoints_with_tls(self) -> List[Dict[str, Any]]: + """Add TLS information to Loki endpoints. + + Also, injects the grafana-cloud-integrator endpoints into those we get from juju relations. + FIXME: these should be separate concerns. 
+ """ loki_endpoints = self._loki_consumer.loki_endpoints if self._cloud.loki_ready: @@ -592,11 +717,37 @@ def _enrich_endpoints(self) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]] } loki_endpoints.append(loki_endpoint) - for endpoint in prometheus_endpoints + loki_endpoints: - endpoint["tls_config"] = { - "insecure_skip_verify": self.model.config.get("tls_insecure_skip_verify") + return self._enhance_endpoints_with_tls(loki_endpoints) + + def _tempo_endpoints_with_tls(self) -> List[Dict[str, Any]]: + """Add TLS information to Tempo endpoints. + + Also, injects the grafana-cloud-integrator endpoints into those we get from juju relations. + FIXME: these should be separate concerns. + """ + tempo_endpoints = [] + if self._tracing.is_ready(): + tempo_endpoints.append( + { + # outgoing traces are all otlp/grpc + # cit: While Tempo and the Agent both can ingest in multiple formats, + # the Agent only exports in OTLP gRPC and HTTP. + "endpoint": self._tracing.get_endpoint("otlp_grpc"), + "insecure": False if self.cert.enabled else True, + } + ) + + if self._cloud.tempo_ready: + tempo_endpoint: Dict[str, Any] = { + "endpoint": self._cloud.tempo_url, } - return prometheus_endpoints, loki_endpoints + if self._cloud.credentials: + tempo_endpoint["basic_auth"] = { + "username": self._cloud.credentials.username, + "password": self._cloud.credentials.password, + } + tempo_endpoints.append(tempo_endpoint) + return self._enhance_endpoints_with_tls(tempo_endpoints) def _cli_args(self) -> str: """Return the cli arguments to pass to agent. @@ -616,8 +767,6 @@ def _generate_config(self) -> Dict[str, Any]: Returns: A yaml string with grafana agent config """ - prometheus_endpoints, _ = self._enrich_endpoints() - config = { "server": self._server_config, "integrations": self._integrations_config, @@ -627,11 +776,12 @@ def _generate_config(self) -> Dict[str, Any]: { "name": "agent_scraper", "scrape_configs": self.metrics_jobs(), - "remote_write": prometheus_endpoints, + "remote_write": self._prometheus_endpoints_with_tls(), } ], }, "logs": self._loki_config, + "traces": self._tempo_config, } return config @@ -662,8 +812,6 @@ def _integrations_config(self) -> dict: # Align the "job" name with those of prometheus_scrape job_name = f"juju_{juju_model}_{juju_model_uuid}_{juju_application}_self-monitoring" - prometheus_endpoints, _ = self._enrich_endpoints() - conf = { "agent": { "enabled": True, @@ -705,11 +853,89 @@ def _integrations_config(self) -> dict: }, ], }, - "prometheus_remote_write": prometheus_endpoints, + "prometheus_remote_write": self._prometheus_endpoints_with_tls(), **self._additional_integrations, } return conf + @property + def _tracing_receivers(self) -> Dict[str, Union[Any, List[Any]]]: + """Receivers configuration for tracing. + + Returns: + a dict with the receivers config. 
+ """ + receivers_set = self._requested_tracing_protocols + + if not receivers_set: + logger.warning("No tempo receivers enabled: grafana-agent cannot ingest traces.") + return {} + + if self.cert.enabled: + base_receiver_config: Dict[str, Union[str, Dict]] = { + "tls": { + "ca_file": str(self._ca_path), + "cert_file": str(self._cert_path), + "key_file": str(self._key_path), + "min_version": "", + } + } + else: + base_receiver_config = {} + + def _receiver_config(protocol: str): + endpoint = "0.0.0.0:" + str(self._tracing_receivers_ports[protocol]) # type: ignore + receiver_config = base_receiver_config.copy() + receiver_config["endpoint"] = endpoint + return receiver_config + + config = {} + + if "zipkin" in receivers_set: + config["zipkin"] = _receiver_config("zipkin") + + otlp_config = {} + if "otlp_http" in receivers_set: + otlp_config["http"] = _receiver_config("otlp_http") + if "otlp_grpc" in receivers_set: + otlp_config["grpc"] = _receiver_config("otlp_grpc") + if otlp_config: + config["otlp"] = {"protocols": otlp_config} + + jaeger_config = {} + if "jaeger_thrift_http" in receivers_set: + jaeger_config["thrift_http"] = _receiver_config("jaeger_thrift_http") + if "jaeger_grpc" in receivers_set: + jaeger_config["grpc"] = _receiver_config("jaeger_grpc") + if jaeger_config: + config["jaeger"] = {"protocols": jaeger_config} + + return config + + @property + def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]: + """The tracing section of the config. + + Returns: + a dict with the tracing config. + """ + endpoints = self._tempo_endpoints_with_tls() + receivers = self._tracing_receivers + + if not receivers: + # pushing a config with an empty receivers section will cause gagent to error out + return {} + + return { + "configs": [ + { + "name": "tempo", + "remote_write": endpoints, + "receivers": receivers, + } + ] + } + @property def _loki_config(self) -> Dict[str, Union[Any, List[Any]]]: """Modifies the loki section of the config. @@ -717,14 +943,12 @@ def _loki_config(self) -> Dict[str, Union[Any, List[Any]]]: Returns: a dict with Loki config """ - _, loki_endpoints = self._enrich_endpoints() - configs = [] if self._loki_consumer.loki_endpoints: configs.append( { "name": "push_api_server", - "clients": loki_endpoints, + "clients": self._loki_endpoints_with_tls(), "scrape_configs": [ { "job_name": "loki", diff --git a/tests/scenario/conftest.py b/tests/scenario/conftest.py index 0ded13b..e658e08 100644 --- a/tests/scenario/conftest.py +++ b/tests/scenario/conftest.py @@ -1,19 +1,14 @@ import shutil -from pathlib import Path, PosixPath +from pathlib import Path import pytest -from tests.scenario.helpers import CHARM_ROOT - - -class Vroot(PosixPath): - def clean(self) -> None: - shutil.rmtree(self) - shutil.copytree(CHARM_ROOT / "src", self / "src") +CHARM_ROOT = Path(__file__).parent.parent.parent @pytest.fixture def vroot(tmp_path) -> Path: - vroot = Vroot(str(tmp_path.absolute())) - vroot.clean() - return vroot + root = Path(str(tmp_path.absolute())) + shutil.rmtree(root) + shutil.copytree(CHARM_ROOT / "src", root / "src") + return root diff --git a/tests/scenario/helpers.py b/tests/scenario/helpers.py deleted file mode 100644 index 1e23b17..0000000 --- a/tests/scenario/helpers.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright 2023 Canonical Ltd. -# See LICENSE file for licensing details. 
-from pathlib import Path - -import yaml - -CHARM_ROOT = Path(__file__).parent.parent.parent - - -def get_charm_meta(charm_type) -> dict: - raw_meta = (CHARM_ROOT / "metadata").with_suffix(".yaml").read_text() - return yaml.safe_load(raw_meta) diff --git a/tests/scenario/test_dashboard_transfer.py b/tests/scenario/test_dashboard_transfer.py new file mode 100644 index 0000000..c299c23 --- /dev/null +++ b/tests/scenario/test_dashboard_transfer.py @@ -0,0 +1,51 @@ +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. +import json + +from charm import GrafanaAgentK8sCharm +from cosl import GrafanaDashboard +from scenario import Container, Context, Relation, State + + +def encode_as_dashboard(dct: dict): + return GrafanaDashboard._serialize(json.dumps(dct).encode("utf-8")) + + +def test_dashboard_propagation(vroot): + # This test verifies that if the charm receives a dashboard via the requirer databag, + # it is correctly transferred to the provider databag. + + content_in = encode_as_dashboard({"hello": "world"}) + expected = { + "charm": "some-test-charm", + "title": "file:some-mock-dashboard", + "content": content_in, + } + data = { + "templates": { + "file:some-mock-dashboard": {"charm": "some-test-charm", "content": content_in} + } + } + consumer = Relation( + "grafana-dashboards-consumer", + relation_id=1, + remote_app_data={"dashboards": json.dumps(data)}, + ) + + provider = Relation("grafana-dashboards-provider", relation_id=2) + + ctx = Context(charm_type=GrafanaAgentK8sCharm, charm_root=vroot) + state = State( + relations=[consumer, provider], + leader=True, + containers=[Container("agent", can_connect=True)], + ) + + with ctx.manager( + state=state, + event=consumer.changed_event, + ) as mgr: + dash = mgr.charm.dashboards[0] + assert dash["charm"] == expected["charm"] + assert dash["title"] == expected["title"] + assert dash["content"] == expected["content"]._deserialize() diff --git a/tests/scenario/test_k8s/conftest.py b/tests/scenario/test_k8s/conftest.py deleted file mode 100644 index e0a1786..0000000 --- a/tests/scenario/test_k8s/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright 2022 Canonical Ltd. -# See LICENSE file for licensing details. -from unittest.mock import patch - -import pytest - - -@pytest.fixture(autouse=True) -def patch_all(): - with patch("charm.KubernetesServicePatch", lambda x, y: None): - yield diff --git a/tests/scenario/test_k8s/test_dashboard_transfer.py b/tests/scenario/test_k8s/test_dashboard_transfer.py deleted file mode 100644 index 2e37c28..0000000 --- a/tests/scenario/test_k8s/test_dashboard_transfer.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 2021 Canonical Ltd. -# See LICENSE file for licensing details. -import json - -from charm import GrafanaAgentK8sCharm -from cosl import GrafanaDashboard -from scenario import Container, Context, Relation, State - - -def encode_as_dashboard(dct: dict): - return GrafanaDashboard._serialize(json.dumps(dct).encode("utf-8")) - - -class TestDashboardPropagation: - """TestDashboardPropagation checks that dashboard propagation works in the K8s charm.""" - - def test_dashboard_propagation(self, vroot): - # This test verifies that if the charm receives a dashboard via the requirer databag, - # it is correctly transferred to the provider databag. 
- - content_in = encode_as_dashboard({"hello": "world"}) - expected = { - "charm": "some-test-charm", - "title": "file:some-mock-dashboard", - "content": content_in, - } - data = { - "templates": { - "file:some-mock-dashboard": {"charm": "some-test-charm", "content": content_in} - } - } - consumer = Relation( - "grafana-dashboards-consumer", - relation_id=1, - remote_app_data={"dashboards": json.dumps(data)}, - ) - - provider = Relation("grafana-dashboards-provider", relation_id=2) - - ctx = Context(charm_type=GrafanaAgentK8sCharm, charm_root=vroot) - state = State( - relations=[consumer, provider], - leader=True, - containers=[Container("agent", can_connect=True)], - ) - - def post_event(charm): - dash = charm.dashboards[0] - assert dash["charm"] == expected["charm"] - assert dash["title"] == expected["title"] - assert dash["content"] == expected["content"]._deserialize() - - ctx.run( - state=state, - event=consumer.changed_event, - post_event=post_event, - ) diff --git a/tests/scenario/test_setup_statuses.py b/tests/scenario/test_setup_statuses.py index 943f845..781052b 100644 --- a/tests/scenario/test_setup_statuses.py +++ b/tests/scenario/test_setup_statuses.py @@ -1,88 +1,30 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. -import dataclasses -from typing import Type -from unittest.mock import patch import charm -import grafana_agent -import pytest -from ops import BlockedStatus, UnknownStatus, WaitingStatus, pebble -from ops.testing import CharmType +from ops import BlockedStatus, UnknownStatus, pebble from scenario import Container, Context, ExecOutput, State -from tests.scenario.helpers import get_charm_meta - -@pytest.fixture(params=["k8s"]) -def substrate(request): - return request.param - - -@pytest.fixture -def charm_type(substrate) -> Type[CharmType]: - return {"k8s": charm.GrafanaAgentK8sCharm}[substrate] - - -@pytest.fixture -def mock_cfg_path(tmp_path): - return tmp_path / "foo.yaml" - - -@dataclasses.dataclass -class _MockProc: - returncode: int = 0 - stdout = "" - - -def _subp_run_mock(*a, **kw): - return _MockProc(0) - - -@pytest.fixture(autouse=True) -def patch_all(substrate, mock_cfg_path): - if substrate == "lxd": - grafana_agent.CONFIG_PATH = mock_cfg_path - with patch("subprocess.run", _subp_run_mock): - yield - yield - - -def test_install(charm_type, substrate, vroot): +def test_install(vroot): context = Context( - charm_type, - meta=get_charm_meta(charm_type), + charm.GrafanaAgentK8sCharm, charm_root=vroot, ) out = context.run("install", State()) + assert out.unit_status == UnknownStatus() - if substrate == "lxd": - assert out.unit_status == ("maintenance", "Installing grafana-agent snap") - - else: - assert out.unit_status == ("unknown", "") - -def test_start(charm_type, substrate, vroot): +def test_start(vroot): context = Context( - charm_type, - meta=get_charm_meta(charm_type), + charm.GrafanaAgentK8sCharm, charm_root=vroot, ) out = context.run("start", State()) + assert out.unit_status == UnknownStatus() - if substrate == "lxd": - assert not grafana_agent.CONFIG_PATH.exists(), "config file written on start" - assert out.unit_status == WaitingStatus("waiting for agent to start") - - else: - assert out.unit_status == UnknownStatus() - - -def test_charm_start_with_container(charm_type, substrate, vroot): - if substrate == "lxd": - pytest.skip("k8s-only test") +def test_charm_start_with_container(vroot): agent = Container( name="agent", can_connect=True, @@ -90,15 +32,14 @@ def test_charm_start_with_container(charm_type, substrate, 
vroot): ) context = Context( - charm_type, - meta=get_charm_meta(charm_type), + charm.GrafanaAgentK8sCharm, charm_root=vroot, ) state = State(containers=[agent]) out = context.run(agent.pebble_ready_event, state) assert out.unit_status == BlockedStatus( - "Missing incoming ('requires') relation: metrics-endpoint|logging-provider|grafana-dashboards-consumer" + "Missing incoming ('requires') relation: metrics-endpoint|logging-provider|tracing-provider|grafana-dashboards-consumer" ) agent_out = out.get_container("agent") assert agent_out.services["agent"].current == pebble.ServiceStatus.ACTIVE diff --git a/tests/scenario/test_start_statuses.py b/tests/scenario/test_start_statuses.py index 0a40dfb..407fe03 100644 --- a/tests/scenario/test_start_statuses.py +++ b/tests/scenario/test_start_statuses.py @@ -1,36 +1,15 @@ # Copyright 2023 Canonical Ltd. # See LICENSE file for licensing details. import dataclasses -import inspect from pathlib import Path -from typing import Type -from unittest.mock import patch -import charm -import pytest -import yaml +from charm import GrafanaAgentK8sCharm from ops import pebble -from ops.testing import CharmType from scenario import Container, Context, ExecOutput, State CHARM_ROOT = Path(__file__).parent.parent.parent -@pytest.fixture(params=["k8s"]) -def substrate(request): - return request.param - - -@pytest.fixture -def charm_type(substrate) -> Type[CharmType]: - return {"k8s": charm.GrafanaAgentK8sCharm}[substrate] - - -@pytest.fixture -def placeholder_cfg_path(tmp_path): - return tmp_path / "foo.yaml" - - @dataclasses.dataclass class _MockProc: returncode: int = 0 @@ -41,53 +20,25 @@ def _subp_run_mock(*a, **kw): return _MockProc(0) -@pytest.fixture(autouse=True) -def patch_all(substrate, placeholder_cfg_path): - if substrate == "lxd": - with patch("subprocess.run", _subp_run_mock), patch( - "grafana_agent.CONFIG_PATH", placeholder_cfg_path - ): - yield - yield - - -@pytest.fixture -def charm_meta(substrate, charm_type) -> dict: - fname = {"k8s": "metadata"}[substrate] - - charm_source_path = Path(inspect.getfile(charm_type)) - charm_root = charm_source_path.parent.parent - - raw_meta = (charm_root / fname).with_suffix(".yaml").read_text() - return yaml.safe_load(raw_meta) - - -def test_install(charm_type, charm_meta, substrate, vroot): +def test_install(vroot): ctx = Context( - charm_type=charm_type, - meta=charm_meta, + charm_type=GrafanaAgentK8sCharm, charm_root=vroot, ) out = ctx.run(state=State(), event="install") - - if substrate == "lxd": - assert out.unit_status == ("maintenance", "Installing grafana-agent snap") - - else: - assert out.unit_status == ("unknown", "") + assert out.unit_status == ("unknown", "") -def test_start(charm_type, charm_meta, substrate, vroot, placeholder_cfg_path): +def test_start(vroot): ctx = Context( - charm_type=charm_type, - meta=charm_meta, + charm_type=GrafanaAgentK8sCharm, charm_root=vroot, ) out = ctx.run(state=State(), event="start") assert out.unit_status.name == "unknown" -def test_charm_start_with_container(charm_type, charm_meta, substrate, vroot): +def test_charm_start_with_container(vroot): agent = Container( name="agent", can_connect=True, @@ -95,8 +46,7 @@ def test_charm_start_with_container(charm_type, charm_meta, substrate, vroot): ) ctx = Context( - charm_type=charm_type, - meta=charm_meta, + charm_type=GrafanaAgentK8sCharm, charm_root=vroot, ) out = ctx.run(state=State(containers=[agent]), event=agent.pebble_ready_event) diff --git a/tests/scenario/test_tracing_integration.py 
b/tests/scenario/test_tracing_integration.py new file mode 100644 index 0000000..1dcd670 --- /dev/null +++ b/tests/scenario/test_tracing_integration.py @@ -0,0 +1,165 @@ +from unittest.mock import patch + +import pytest +import scenario +import yaml +from charm import GrafanaAgentK8sCharm +from charms.tempo_k8s.v1.charm_tracing import charm_tracing_disabled +from charms.tempo_k8s.v2.tracing import Receiver, TracingProviderAppData, TracingRequirerAppData +from grafana_agent import CONFIG_PATH +from ops import pebble + + +@pytest.fixture +def ctx(vroot): + with charm_tracing_disabled(): + with patch("socket.getfqdn", new=lambda: "localhost"): + yield scenario.Context(GrafanaAgentK8sCharm, charm_root=vroot) + + +@pytest.fixture +def base_state(): + yield scenario.State( + leader=True, + containers=[ + scenario.Container( + "agent", + can_connect=True, + # set it to inactive so we can detect when an event has caused it to start + service_status={"agent": pebble.ServiceStatus.INACTIVE}, + ) + ], + ) + + +def test_tracing_relation(ctx, base_state): + # GIVEN a tracing relation over the tracing-provider endpoint + tracing = scenario.Relation( + "tracing-provider", + remote_app_data=TracingRequirerAppData(receivers=["otlp_http", "otlp_grpc"]).dump(), + ) + + state = base_state.replace(relations=[tracing]) + # WHEN we process any setup event for the relation + state_out = ctx.run(tracing.changed_event, state) + + agent = state_out.get_container("agent") + + # THEN the agent has started + assert agent.services["agent"].is_running() + # AND the grafana agent config has a traces config section + fs = agent.get_filesystem(ctx) + gagent_config = fs.joinpath(*CONFIG_PATH.strip("/").split("/")) + assert gagent_config.exists() + yml = yaml.safe_load(gagent_config.read_text()) + assert yml["traces"]["configs"][0], yml.get("traces", "") + + +def test_tracing_relations_in_and_out(ctx, base_state): + # GIVEN a tracing relation over the tracing-provider endpoint and one over tracing + tracing_provider = scenario.Relation( + "tracing-provider", + remote_app_data=TracingRequirerAppData(receivers=["otlp_http", "otlp_grpc"]).dump(), + ) + tracing = scenario.Relation( + "tracing", + remote_app_data=TracingProviderAppData( + receivers=[ + Receiver(protocol={"name": "otlp_grpc", "type": "grpc"}, url="http:foo.com:1111") + ] + ).dump(), + ) + + state = base_state.replace(relations=[tracing, tracing_provider]) + # WHEN we process any setup event for the relation + state_out = ctx.run(tracing.changed_event, state) + + agent = state_out.get_container("agent") + + # THEN the agent has started + assert agent.services["agent"].is_running() + # AND the grafana agent config has a traces config section + fs = agent.get_filesystem(ctx) + gagent_config = fs.joinpath(*CONFIG_PATH.strip("/").split("/")) + assert gagent_config.exists() + yml = yaml.safe_load(gagent_config.read_text()) + assert yml["traces"] + + +def test_tracing_relation_passthrough(ctx, base_state): + # GIVEN a tracing relation over the tracing-provider endpoint and one over tracing + tracing_provider = scenario.Relation( + "tracing-provider", + remote_app_data=TracingRequirerAppData(receivers=["otlp_http", "otlp_grpc"]).dump(), + ) + tracing = scenario.Relation( + "tracing", + remote_app_data=TracingProviderAppData( + receivers=[ + Receiver(protocol={"name": "otlp_grpc", "type": "grpc"}, url="http:foo.com:1111") + ] + ).dump(), + ) + + state = base_state.replace(relations=[tracing, tracing_provider]) + # WHEN we process any setup event for the relation + state_out 
= ctx.run(tracing.changed_event, state) + + # THEN we act as a tracing provider for 'tracing-provider', and as requirer for 'tracing' + tracing_out = TracingRequirerAppData.load(state_out.get_relations("tracing")[0].local_app_data) + tracing_provider_out = TracingProviderAppData.load( + state_out.get_relations("tracing-provider")[0].local_app_data + ) + assert set(tracing_out.receivers) == {"otlp_grpc", "otlp_http"} + otlp_grpc_provider_def = [ + r for r in tracing_provider_out.receivers if r.protocol.name == "otlp_grpc" + ][0] + otlp_http_provider_def = [ + r for r in tracing_provider_out.receivers if r.protocol.name == "otlp_http" + ][0] + assert otlp_grpc_provider_def.url == "localhost:4317" + assert otlp_http_provider_def.url == "http://localhost:4318" + + +@pytest.mark.parametrize( + "force_enable", + ( + ["zipkin", "jaeger_thrift_http", "jaeger_grpc"], + ["zipkin", "jaeger_thrift_http"], + ["jaeger_thrift_http"], + ), +) +def test_tracing_relation_passthrough_with_force_enable(ctx, base_state, force_enable): + # GIVEN a tracing relation over the tracing-provider endpoint and one over tracing + tracing_provider = scenario.Relation( + "tracing-provider", + remote_app_data=TracingRequirerAppData(receivers=["otlp_http", "otlp_grpc"]).dump(), + ) + tracing = scenario.Relation( + "tracing", + remote_app_data=TracingProviderAppData( + receivers=[ + Receiver(protocol={"name": "otlp_grpc", "type": "grpc"}, url="http:foo.com:1111") + ] + ).dump(), + ) + + # AND given we're configured to always enable some protocols + state = base_state.replace( + config={f"always_enable_{proto}": True for proto in force_enable}, + relations=[tracing, tracing_provider], + ) + # WHEN we process any setup event for the relation + state_out = ctx.run(tracing.changed_event, state) + + # THEN we act as a tracing provider for 'tracing-provider', and as requirer for 'tracing' + tracing_out = TracingRequirerAppData.load(state_out.get_relations("tracing")[0].local_app_data) + tracing_provider_out = TracingProviderAppData.load( + state_out.get_relations("tracing-provider")[0].local_app_data + ) + + # we still only request otlp grpc and http for charm traces and because gagent funnels all to grpc + assert set(tracing_out.receivers) == {"otlp_grpc", "otlp_http"} + # but we provide all + providing_protocols = {r.protocol.name for r in tracing_provider_out.receivers} + assert providing_protocols == {"otlp_grpc", "otlp_http"}.union(force_enable) diff --git a/tests/unit/test_scrape_configuration.py b/tests/unit/test_scrape_configuration.py index bb54919..6884e58 100644 --- a/tests/unit/test_scrape_configuration.py +++ b/tests/unit/test_scrape_configuration.py @@ -184,6 +184,7 @@ def test_remote_write_configuration(self): }, "server": {"log_level": "info"}, "logs": {}, + "traces": {}, } config = yaml.safe_load(agent_container.pull("/etc/grafana-agent.yaml").read()) diff --git a/tox.ini b/tox.ini index 4144204..3a671d4 100644 --- a/tox.ini +++ b/tox.ini @@ -33,7 +33,7 @@ deps = black ruff commands = - ruff --fix {[vars]all_path} + ruff check --fix {[vars]all_path} black {[vars]all_path} [testenv:lint] @@ -45,7 +45,7 @@ deps = codespell commands = codespell . 
--skip .git --skip .tox --skip build --skip lib --skip venv --skip .mypy_cache --skip *.svg - ruff {[vars]all_path} + ruff check {[vars]all_path} black --check --diff {[vars]all_path} [testenv:static-{charm}] @@ -85,7 +85,7 @@ deps = -r{toxinidir}/requirements.txt pytest cosl - ops-scenario > 4 + ops-scenario >= 6.1.5 commands = pytest -vv --tb native --log-cli-level=INFO -s {posargs} {[vars]tst_path}/scenario --ignore {[vars]tst_path}/scenario/test_k8s
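
For reference, a minimal sketch of how the new calculate_expiry_notification_time() added to lib/charms/tls_certificates_interface/v3/tls_certificates.py picks a notification time: the provider's recommendation wins, then the requirer's, and only then the dynamic fallback of one third of the certificate's lifetime before expiry. The dates and hour values below are made up for illustration; only the call signature and fallback order come from the patch.

from datetime import datetime, timedelta, timezone

from charms.tls_certificates_interface.v3.tls_certificates import (
    calculate_expiry_notification_time,
)

start = datetime(2024, 1, 1, tzinfo=timezone.utc)
expiry = start + timedelta(days=90)

# Provider recommended 720h (30 days): its recommendation is preferred.
assert calculate_expiry_notification_time(start, expiry, 720, 168) == expiry - timedelta(hours=720)

# No provider recommendation: fall back to the requirer's 168h (7 days).
assert calculate_expiry_notification_time(start, expiry, None, 168) == expiry - timedelta(hours=168)

# Neither side recommended anything: notify one third of the lifetime before expiry,
# i.e. 30 days before expiry for this 90-day certificate.
assert calculate_expiry_notification_time(start, expiry, None, None) == expiry - timedelta(days=30)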
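
To make the new traces pipeline concrete: assuming only otlp_http and otlp_grpc were requested over 'tracing-provider', no server certificate is available yet, tls_insecure_skip_verify is left at false, and the upstream Tempo advertises an otlp_grpc endpoint at tempo.example.svc:4317 (a made-up address), the "traces" block that _generate_config() renders into /etc/grafana-agent.yaml would look roughly as follows. This is an illustrative sketch of the shape produced by _tempo_config and _tracing_receivers, not output captured from a deployment.

# Illustrative only -- structure follows _tempo_config / _tracing_receivers in src/grafana_agent.py.
traces = {
    "configs": [
        {
            "name": "tempo",
            "remote_write": [
                {
                    # outgoing spans are always forwarded as OTLP gRPC
                    "endpoint": "tempo.example.svc:4317",
                    "insecure": True,  # no server certificate on the agent yet
                    "tls_config": {"insecure_skip_verify": False},
                }
            ],
            "receivers": {
                "otlp": {
                    "protocols": {
                        "http": {"endpoint": "0.0.0.0:4318"},
                        "grpc": {"endpoint": "0.0.0.0:4317"},
                    }
                }
            },
        }
    ]
}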
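
The receivers that end up both in that config and in the 'tracing-provider' app databag are the union of the protocols requested by related apps and those forced on via the always_enable_* config options. A rough sketch of that flow, using the names from src/grafana_agent.py (condensed, not a verbatim excerpt):

requested = set(self._tracing_provider.requested_protocols())   # e.g. {"otlp_http"}
forced = {
    proto
    for proto in ("otlp_http", "otlp_grpc", "jaeger_grpc", "jaeger_thrift_http", "zipkin")
    if self.config.get(f"always_enable_{proto}")                 # e.g. always_enable_zipkin=true
}
self._tracing_provider.publish_receivers(
    tuple(
        (proto, self._get_tracing_receiver_url(proto))
        for proto in requested | forced
    )
)
# gRPC-based receivers are advertised without a scheme ("<agent fqdn>:4317"); HTTP-based ones
# carry one ("http://<agent fqdn>:4318", or https:// once the agent's certificate is available).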