From 5b6e32c7ab1e58c248c30265d2f5c474e2db755d Mon Sep 17 00:00:00 2001 From: Mateusz Kulewicz Date: Tue, 26 Nov 2024 13:06:49 +0100 Subject: [PATCH] Split endpoints into charm-tracing and workload-tracing (#358) * Split endpoints into charm-tracing and workload-tracing; integration tests * formatting * Adjust method import to better naming in lib * Add juju topology to workload traces * Don't use shared helpers yet * Review remarks: duplicate method, upper-case constants --- charmcraft.yaml | 9 +- requirements.txt | 1 + src/charm.py | 60 ++++++++---- tests/integration/helpers.py | 105 +++++++++++++++++++++ tests/integration/test_workload_tracing.py | 58 ++++++++++++ tox.ini | 2 + 6 files changed, 216 insertions(+), 19 deletions(-) create mode 100644 tests/integration/test_workload_tracing.py diff --git a/charmcraft.yaml b/charmcraft.yaml index 23349ffd..e158f609 100644 --- a/charmcraft.yaml +++ b/charmcraft.yaml @@ -71,9 +71,16 @@ requires: description: | Receive oauth server's info and a set of client credentials. This relation can be used to integrate grafana with an oAuth2/OIDC Provider. - tracing: + charm-tracing: + description: | + Enables sending charm traces to a distributed tracing backend such as Tempo. + limit: 1 interface: tracing + workload-tracing: + description: | + Enables sending workload traces to a distributed tracing backend such as Tempo. limit: 1 + interface: tracing provides: metrics-endpoint: diff --git a/requirements.txt b/requirements.txt index 7eac6b3a..82ab42bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ cryptography # lib/charms/tempo_k8s/v1/charm_tracing.py opentelemetry-exporter-otlp-proto-http pydantic +cosl \ No newline at end of file diff --git a/src/charm.py b/src/charm.py index f202209a..5ef1239e 100755 --- a/src/charm.py +++ b/src/charm.py @@ -27,9 +27,10 @@ import socket import string import time +from cosl import JujuTopology from io import StringIO from pathlib import Path -from typing import Any, Callable, Dict, cast, Optional +from typing import Any, Callable, Dict, cast from urllib.parse import urlparse import subprocess @@ -72,7 +73,7 @@ UpgradeCharmEvent, ) from charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm -from charms.tempo_coordinator_k8s.v0.tracing import TracingEndpointRequirer +from charms.tempo_coordinator_k8s.v0.tracing import TracingEndpointRequirer, charm_tracing_config from ops.framework import StoredState from ops.main import main from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, OpenedPort @@ -124,8 +125,8 @@ @trace_charm( - tracing_endpoint="tracing_endpoint", - server_cert="server_ca_cert_path", + tracing_endpoint="charm_tracing_endpoint", + server_cert="server_cert", extra_types=[ AuthRequirer, CertHandler, @@ -159,6 +160,7 @@ def __init__(self, *args): self._grafana_config_ini_hash = None self._grafana_datasources_hash = None self._stored.set_default(admin_password="") + self._topology = JujuTopology.from_charm(self) # -- cert_handler self.cert_handler = CertHandler( @@ -193,7 +195,15 @@ def __init__(self, *args): self.cert_handler.on.cert_changed, # pyright: ignore ], ) - self.tracing = TracingEndpointRequirer(self, protocols=["otlp_http", "otlp_grpc"]) + self.charm_tracing = TracingEndpointRequirer( + self, relation_name="charm-tracing", protocols=["otlp_http"] + ) + self.workload_tracing = TracingEndpointRequirer( + self, relation_name="workload-tracing", protocols=["otlp_grpc"] + ) + self.charm_tracing_endpoint, self.server_cert = charm_tracing_config( + self.charm_tracing, CA_CERT_PATH + ) # -- standard events self.framework.observe(self.on.install, self._on_install) @@ -265,6 +275,15 @@ def __init__(self, *args): self.grafana_auth_requirer.on.auth_conf_available, # pyright: ignore self._on_grafana_auth_conf_available, ) + # -- workload tracing observations + self.framework.observe( + self.workload_tracing.on.endpoint_changed, # type: ignore + self._on_workload_tracing_endpoint_changed, + ) + self.framework.observe( + self.workload_tracing.on.endpoint_removed, # type: ignore + self._on_workload_tracing_endpoint_removed, + ) # -- cert_handler observations self.framework.observe( @@ -745,6 +764,14 @@ def _on_database_broken(self, event: RelationBrokenEvent) -> None: # Cleanup the config file self._configure() + def _on_workload_tracing_endpoint_changed(self, event: RelationChangedEvent) -> None: + """Adds workload tracing information to grafana's config.""" + self._configure() + + def _on_workload_tracing_endpoint_removed(self, event: RelationBrokenEvent) -> None: + """Removes workload tracing information from grafana's config.""" + self._configure() + def _generate_grafana_config(self) -> str: """Generate a database configuration for Grafana. @@ -775,7 +802,7 @@ def _generate_tracing_config(self) -> str: A string containing the required tracing information to be stubbed into the config file. """ - tracing = self.tracing + tracing = self.workload_tracing if not tracing.is_ready(): return "" endpoint = tracing.get_endpoint("otlp_grpc") @@ -1054,6 +1081,15 @@ def _build_layer(self) -> Layer: } ) + if self.workload_tracing.is_ready(): + topology = self._topology + extra_info.update( + { + "OTEL_RESOURCE_ATTRIBUTES": f"juju_application={topology.application},juju_model={topology.model}" + + f",juju_model_uuid={topology.model_uuid},juju_unit={topology.unit},juju_charm={topology.charm_name}", + } + ) + layer = Layer( { "summary": "grafana-k8s layer", @@ -1546,18 +1582,6 @@ def _on_oauth_info_changed(self, event: OAuthInfoChangedEvent) -> None: """Event handler for the oauth_info_changed event.""" self._configure() - @property - def tracing_endpoint(self) -> Optional[str]: - """Tempo endpoint for charm tracing.""" - if self.tracing.is_ready(): - return self.tracing.get_endpoint("otlp_http") - return None - - @property - def server_ca_cert_path(self) -> Optional[str]: - """Server CA certificate path for TLS tracing.""" - return str(CA_CERT_PATH) if self.cert_handler.enabled else None - if __name__ == "__main__": main(GrafanaCharm, use_juju_for_storage=True) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index ea33ee50..884c4b71 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -3,16 +3,22 @@ # See LICENSE file for licensing details. import asyncio import grp +import json import logging import subprocess from pathlib import Path from typing import Tuple +import requests import yaml from asyncstdlib import functools from pytest_operator.plugin import OpsTest from urllib.parse import urlparse from workload import Grafana +from juju.application import Application +from juju.unit import Unit +from minio import Minio +from tenacity import retry, stop_after_attempt, wait_exponential logger = logging.getLogger(__name__) @@ -377,6 +383,105 @@ async def curl(ops_test: OpsTest, *, cert_dir: str, cert_path: str, ip_addr: str return stdout +async def deploy_and_configure_minio(ops_test: OpsTest) -> None: + """Deploy and set up minio and s3-integrator needed for s3-like storage backend in the HA charms.""" + config = { + "access-key": "accesskey", + "secret-key": "secretkey", + } + await ops_test.model.deploy("minio", channel="edge", trust=True, config=config) + await ops_test.model.wait_for_idle(apps=["minio"], status="active", timeout=2000) + minio_addr = await unit_address(ops_test, "minio", 0) + + mc_client = Minio( + f"{minio_addr}:9000", + access_key="accesskey", + secret_key="secretkey", + secure=False, + ) + + # create tempo bucket + found = mc_client.bucket_exists("tempo") + if not found: + mc_client.make_bucket("tempo") + + # configure s3-integrator + s3_integrator_app: Application = ops_test.model.applications["s3-integrator"] + s3_integrator_leader: Unit = s3_integrator_app.units[0] + + await s3_integrator_app.set_config( + { + "endpoint": f"minio-0.minio-endpoints.{ops_test.model.name}.svc.cluster.local:9000", + "bucket": "tempo", + } + ) + + action = await s3_integrator_leader.run_action("sync-s3-credentials", **config) + action_result = await action.wait() + assert action_result.status == "completed" + + +async def deploy_tempo_cluster(ops_test: OpsTest): + """Deploys tempo in its HA version together with minio and s3-integrator.""" + tempo_app = "tempo" + worker_app = "tempo-worker" + tempo_worker_charm_url, worker_channel = "tempo-worker-k8s", "edge" + tempo_coordinator_charm_url, coordinator_channel = "tempo-coordinator-k8s", "edge" + await ops_test.model.deploy( + tempo_worker_charm_url, application_name=worker_app, channel=worker_channel, trust=True + ) + await ops_test.model.deploy( + tempo_coordinator_charm_url, + application_name=tempo_app, + channel=coordinator_channel, + trust=True, + ) + await ops_test.model.deploy("s3-integrator", channel="edge") + + await ops_test.model.integrate(tempo_app + ":s3", "s3-integrator" + ":s3-credentials") + await ops_test.model.integrate(tempo_app + ":tempo-cluster", worker_app + ":tempo-cluster") + + await deploy_and_configure_minio(ops_test) + async with ops_test.fast_forward(): + await ops_test.model.wait_for_idle( + apps=[tempo_app, worker_app, "s3-integrator"], + status="active", + timeout=2000, + idle_period=30, + ) + + +def get_traces(tempo_host: str, service_name="tracegen-otlp_http", tls=True): + """Get traces directly from Tempo REST API.""" + url = f"{'https' if tls else 'http'}://{tempo_host}:3200/api/search?tags=service.name={service_name}" + req = requests.get( + url, + verify=False, + ) + assert req.status_code == 200 + traces = json.loads(req.text)["traces"] + return traces + + +@retry(stop=stop_after_attempt(15), wait=wait_exponential(multiplier=1, min=4, max=10)) +async def get_traces_patiently(tempo_host, service_name="tracegen-otlp_http", tls=True): + """Get traces directly from Tempo REST API, but also try multiple times. + + Useful for cases when Tempo might not return the traces immediately (its API is known for returning data in + random order). + """ + traces = get_traces(tempo_host, service_name=service_name, tls=tls) + assert len(traces) > 0 + return traces + + +async def get_application_ip(ops_test: OpsTest, app_name: str) -> str: + """Get the application IP address.""" + status = await ops_test.model.get_status() + app = status["applications"][app_name] + return app.public_address + + class ModelConfigChange: """Context manager for temporarily changing a model config option.""" diff --git a/tests/integration/test_workload_tracing.py b/tests/integration/test_workload_tracing.py new file mode 100644 index 00000000..9232342d --- /dev/null +++ b/tests/integration/test_workload_tracing.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# Copyright 2021 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging +from pathlib import Path + +import pytest +import yaml +from helpers import ( + oci_image, + check_grafana_is_ready, + deploy_tempo_cluster, + get_traces_patiently, + get_application_ip, +) + +logger = logging.getLogger(__name__) + +METADATA = yaml.safe_load(Path("./charmcraft.yaml").read_text()) +APP_NAME = "grafana" +TEMPO_APP_NAME = "tempo" +RESOURCES = { + "grafana-image": oci_image("./charmcraft.yaml", "grafana-image"), + "litestream-image": oci_image("./charmcraft.yaml", "litestream-image"), +} + + +async def test_setup_env(ops_test): + await ops_test.model.set_config({"logging-config": "=WARNING; unit=DEBUG"}) + + +@pytest.mark.abort_on_fail +async def test_workload_tracing_is_present(ops_test, grafana_charm): + logger.info("deploying tempo cluster") + await deploy_tempo_cluster(ops_test) + + logger.info("deploying local charm") + await ops_test.model.deploy( + grafana_charm, resources=RESOURCES, application_name=APP_NAME, trust=True + ) + await ops_test.model.wait_for_idle( + apps=[APP_NAME], status="active", timeout=300, wait_for_exact_units=1 + ) + + await check_grafana_is_ready(ops_test, APP_NAME, 0) + # we relate _only_ workload tracing not to confuse with charm traces + await ops_test.model.add_relation( + "{}:workload-tracing".format(APP_NAME), "{}:tracing".format(TEMPO_APP_NAME) + ) + await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active") + + # Verify workload traces from grafana are ingested into Tempo + assert await get_traces_patiently( + await get_application_ip(ops_test, TEMPO_APP_NAME), + service_name=f"{APP_NAME}", + tls=False, + ) diff --git a/tox.ini b/tox.ini index b28352ce..38e93ab8 100644 --- a/tox.ini +++ b/tox.ini @@ -112,6 +112,8 @@ deps = pytest-playwright lightkube git+https://github.com/canonical/iam-bundle@671a33419869fab1fe63d81873b41f6b181498e3#egg=oauth_tools + cosl + minio commands = playwright install pytest -vv --tb native --log-cli-level=INFO --color=yes -s {posargs} {toxinidir}/tests/integration