[DPE-4066] HA interface #317

Merged · 23 commits · Aug 15, 2024
5 changes: 5 additions & 0 deletions config.yaml
@@ -16,6 +16,11 @@ options:
The port on which the pgbouncer prometheus exporter serves metrics.
type: int

vip:
description: |
Virtual IP to use to front pgbouncer units. Used only in case of external node connection.
type: string

pool_mode:
default: session
description: |
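The new vip option is read later in this PR via self.config.get("vip") in src/charm.py. As a hedged illustration only (not code from this diff), a charm could validate such a value with the standard library before acting on it:

```python
# Hypothetical sketch, not part of this PR: validate a "vip" config value.
from ipaddress import ip_address
from typing import Optional


def parse_vip(raw: Optional[str]) -> Optional[str]:
    """Return a normalized VIP string, or None if the option is unset."""
    if not raw:
        return None
    # ip_address() raises ValueError for anything that is not a valid IPv4/IPv6
    # address, which the charm could surface as a BlockedStatus.
    return str(ip_address(raw.strip()))
```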
29 changes: 22 additions & 7 deletions lib/charms/grafana_agent/v0/cos_agent.py
@@ -213,14 +213,18 @@ def __init__(self, *args):
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union

import pydantic
from cosl import GrafanaDashboard, JujuTopology
from cosl.rules import AlertRules
from ops.charm import RelationChangedEvent
from ops.framework import EventBase, EventSource, Object, ObjectEvents
from ops.model import Relation
from ops.testing import CharmType

try:
import pydantic.v1 as pydantic
except ImportError:
import pydantic

if TYPE_CHECKING:
try:
from typing import TypedDict
@@ -234,9 +238,9 @@ class _MetricsEndpointDict(TypedDict):

LIBID = "dc15fa84cef84ce58155fb84f6c6213a"
LIBAPI = 0
LIBPATCH = 8
LIBPATCH = 9

PYDEPS = ["cosl", "pydantic < 2"]
PYDEPS = ["cosl", "pydantic"]

DEFAULT_RELATION_NAME = "cos-agent"
DEFAULT_PEER_RELATION_NAME = "peers"
@@ -721,8 +725,18 @@ def metrics_jobs(self) -> List[Dict]:
@property
def snap_log_endpoints(self) -> List[SnapEndpoint]:
"""Fetch logging endpoints exposed by related snaps."""
endpoints = []
endpoints_with_topology = self.snap_log_endpoints_with_topology
for endpoint, _ in endpoints_with_topology:
endpoints.append(endpoint)

return endpoints

@property
def snap_log_endpoints_with_topology(self) -> List[Tuple[SnapEndpoint, JujuTopology]]:
"""Fetch logging endpoints and charm topology for each related snap."""
plugs = []
for data, _ in self._remote_data:
for data, topology in self._remote_data:
targets = data.log_slots
if targets:
for target in targets:
@@ -733,15 +747,16 @@ def snap_log_endpoints(self) -> List[SnapEndpoint]:
"endpoints; this should not happen."
)
else:
plugs.append(target)
plugs.append((target, topology))

endpoints = []
for plug in plugs:
for plug, topology in plugs:
if ":" not in plug:
logger.error(f"invalid plug definition received: {plug}. Ignoring...")
else:
endpoint = SnapEndpoint(*plug.split(":"))
endpoints.append(endpoint)
endpoints.append((endpoint, topology))

return endpoints

@property
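As a rough usage sketch (the requirer argument and the printing are illustrative, not taken from this PR), the new property yields (SnapEndpoint, JujuTopology) pairs while the existing snap_log_endpoints property keeps returning bare endpoints:

```python
# Hypothetical consumer of COSAgentRequirer from this library.
def log_snap_endpoints(requirer) -> None:
    """Illustrative only: pair each snap log endpoint with its charm topology."""
    for endpoint, topology in requirer.snap_log_endpoints_with_topology:
        # endpoint is a SnapEndpoint built from an "owner:name" plug string;
        # topology is the JujuTopology of the charm that exposed the slot.
        print(f"{endpoint} exposed by {topology}")
```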
87 changes: 57 additions & 30 deletions lib/charms/tempo_k8s/v1/charm_tracing.py
@@ -172,14 +172,65 @@ def my_tracing_endpoint(self) -> Optional[str]:
provide an *absolute* path to the certificate file instead.
"""


def _remove_stale_otel_sdk_packages():
"""Hack to remove stale opentelemetry sdk packages from the charm's python venv.

See https://github.com/canonical/grafana-agent-operator/issues/146 and
https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after
this juju issue is resolved and sufficient time has passed to expect most users of this library
have migrated to the patched version of juju. When this patch is removed, un-ignore rule E402
for this file (see [tool.ruff.lint.per-file-ignores] in pyproject.toml).

This only has an effect if executed on an upgrade-charm event.
"""
# all imports are local to keep this function standalone, side-effect-free, and easy to revert later
import os

if os.getenv("JUJU_DISPATCH_PATH") != "hooks/upgrade-charm":
return

import logging
import shutil
from collections import defaultdict

from importlib_metadata import distributions

otel_logger = logging.getLogger("charm_tracing_otel_patcher")
otel_logger.debug("Applying _remove_stale_otel_sdk_packages patch on charm upgrade")
# group by name all distributions starting with "opentelemetry_"
otel_distributions = defaultdict(list)
for distribution in distributions():
name = distribution._normalized_name # type: ignore
if name.startswith("opentelemetry_"):
otel_distributions[name].append(distribution)

otel_logger.debug(f"Found {len(otel_distributions)} opentelemetry distributions")

# If we have multiple distributions with the same name, remove any that have 0 associated files
for name, distributions_ in otel_distributions.items():
if len(distributions_) <= 1:
continue

otel_logger.debug(f"Package {name} has multiple ({len(distributions_)}) distributions.")
for distribution in distributions_:
if not distribution.files:  # files is None or an empty list
path = distribution._path # type: ignore
otel_logger.info(f"Removing empty distribution of {name} at {path}.")
shutil.rmtree(path)

otel_logger.debug("Successfully applied _remove_stale_otel_sdk_packages patch. ")


_remove_stale_otel_sdk_packages()


import functools
import inspect
import logging
import os
import shutil
from contextlib import contextmanager
from contextvars import Context, ContextVar, copy_context
from importlib.metadata import distributions
from pathlib import Path
from typing import (
Any,
@@ -199,14 +250,15 @@ def my_tracing_endpoint(self) -> Optional[str]:
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import Span, TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.trace import INVALID_SPAN, Tracer
from opentelemetry.trace import get_current_span as otlp_get_current_span
from opentelemetry.trace import (
INVALID_SPAN,
Tracer,
get_tracer,
get_tracer_provider,
set_span_in_context,
set_tracer_provider,
)
from opentelemetry.trace import get_current_span as otlp_get_current_span
from ops.charm import CharmBase
from ops.framework import Framework

@@ -219,7 +271,7 @@ def my_tracing_endpoint(self) -> Optional[str]:
# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version

LIBPATCH = 13
LIBPATCH = 14

PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"]

@@ -361,30 +413,6 @@ def _get_server_cert(
return server_cert


def _remove_stale_otel_sdk_packages():
"""Hack to remove stale opentelemetry sdk packages from the charm's python venv.

See https://github.com/canonical/grafana-agent-operator/issues/146 and
https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after
this juju issue is resolved and sufficient time has passed to expect most users of this library
have migrated to the patched version of juju.

This only does something if executed on an upgrade-charm event.
"""
if os.getenv("JUJU_DISPATCH_PATH") == "hooks/upgrade-charm":
logger.debug("Executing _remove_stale_otel_sdk_packages patch on charm upgrade")
# Find any opentelemetry_sdk distributions
otel_sdk_distributions = list(distributions(name="opentelemetry_sdk"))
# If there is more than 1, inspect each and if it has 0 entrypoints, infer that it is stale
if len(otel_sdk_distributions) > 1:
for distribution in otel_sdk_distributions:
if len(distribution.entry_points) == 0:
# Distribution appears to be empty. Remove it
path = distribution._path # type: ignore
logger.debug(f"Removing empty opentelemetry_sdk distribution at: {path}")
shutil.rmtree(path)


def _setup_root_span_initializer(
charm_type: _CharmType,
tracing_endpoint_attr: str,
@@ -420,7 +448,6 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs):
# apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm.
# it could be trouble if someone ever decides to implement their own tracer parallel to
# ours and before the charm has inited. We assume they won't.
_remove_stale_otel_sdk_packages()
resource = Resource.create(
attributes={
"service.name": _service_name,
5 changes: 5 additions & 0 deletions metadata.yaml
@@ -71,3 +71,8 @@ requires:
interface: tracing
limit: 1
optional: true

ha:
interface: hacluster
limit: 1
optional: true
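The relations/hacluster.py module backing this new ha endpoint is not included in the hunks shown here. As orientation only, a wrapper consistent with how it is used in src/charm.py below (self.hacluster.relation, self.hacluster.set_vip(...)) might look like the following sketch; every name and the relation-data layout are assumptions, not the PR's actual implementation:

```python
# Hypothetical sketch of an hacluster relation wrapper (not the real
# relations/hacluster.py from this PR).
import json
from typing import Optional

from ops.framework import Object
from ops.model import Relation

HACLUSTER_RELATION_NAME = "ha"  # matches the constant added in src/constants.py


class HaCluster(Object):
    def __init__(self, charm):
        super().__init__(charm, HACLUSTER_RELATION_NAME)
        self.charm = charm

    @property
    def relation(self) -> Optional[Relation]:
        """Return the ha relation, or None if hacluster is not related."""
        return self.model.get_relation(HACLUSTER_RELATION_NAME)

    def set_vip(self, vip: Optional[str]) -> None:
        """Publish (or clear) a VIP resource for the hacluster subordinate."""
        if not self.relation:
            return
        data = self.relation.data[self.charm.unit]
        if vip:
            # hacluster consumes JSON-encoded resource definitions from relation
            # data; the exact keys and values here are assumptions.
            data["json_resources"] = json.dumps(
                {f"res_{self.charm.app.name}_vip": "ocf:heartbeat:IPaddr2"}
            )
            data["json_resource_params"] = json.dumps(
                {f"res_{self.charm.app.name}_vip": f'params ip="{vip}"'}
            )
        else:
            data.update({"json_resources": "{}", "json_resource_params": "{}"})
```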
56 changes: 43 additions & 13 deletions src/charm.py
@@ -24,17 +24,18 @@
from charms.tempo_k8s.v1.charm_tracing import trace_charm
from charms.tempo_k8s.v2.tracing import TracingEndpointRequirer
from jinja2 import Template
from ops import JujuVersion
from ops.charm import CharmBase, StartEvent
from ops.main import main
from ops.model import (
from ops import (
ActiveStatus,
BlockedStatus,
CharmBase,
JujuVersion,
MaintenanceStatus,
ModelError,
Relation,
StartEvent,
WaitingStatus,
)
from ops.main import main

from constants import (
APP_SCOPE,
@@ -65,6 +66,7 @@
)
from relations.backend_database import BackendDatabaseRequires
from relations.db import DbProvides
from relations.hacluster import HaCluster
from relations.peers import Peers
from relations.pgbouncer_provider import PgBouncerProvider
from upgrade import PgbouncerUpgrade, get_pgbouncer_dependencies_model
@@ -120,6 +122,7 @@ def __init__(self, *args):
self.legacy_db_relation = DbProvides(self, admin=False)
self.legacy_db_admin_relation = DbProvides(self, admin=True)
self.tls = PostgreSQLTLS(self, PEER_RELATION_NAME)
self.hacluster = HaCluster(self)

self.service_ids = list(range(self.instances_count))
self.pgb_services = [
@@ -507,8 +510,24 @@ def update_status(self):
self.unit.status = BlockedStatus("backend database relation not ready")
return

if self.hacluster.relation and not self._is_exposed:
self.unit.status = BlockedStatus("ha integration used without data-integrator")
return

vip = self.config.get("vip")
if self.hacluster.relation and not vip:
self.unit.status = BlockedStatus("ha integration used without vip configuration")
return

if vip and not self._is_exposed:
self.unit.status = BlockedStatus("vip configuration without data-integrator")
return

if self.check_pgb_running():
self.unit.status = ActiveStatus()
if self.unit.is_leader() and vip:
self.unit.status = ActiveStatus(f"VIP: {vip}")
Review comment (author): Setting just the app status message here didn't show when testing locally, while setting a proper app status didn't clear up when blocking.

else:
self.unit.status = ActiveStatus()
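For context on the trade-off the review comment above describes (illustrative only, not code from this PR): in ops, application status is shared by all units and may only be set by the leader, while unit status is per-unit, which is why the VIP is surfaced through the leader's unit status here:

```python
# Illustrative only: application status vs. unit status in ops.
from ops.model import ActiveStatus


def show_vip_status(charm, vip: str) -> None:
    if charm.unit.is_leader():
        # Application status is shared across units; only the leader may set it.
        charm.app.status = ActiveStatus()
    # Unit status is always settable by the unit itself; this PR uses it to
    # display the VIP on the leader.
    charm.unit.status = ActiveStatus(f"VIP: {vip}")
```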

def _on_config_changed(self, event) -> None:
"""Config changed handler.
@@ -521,19 +540,28 @@ def _on_config_changed(self, event) -> None:
event.defer()
return

old_vip = self.peers.app_databag.get("current_vip", "")
vip = self.config.get("vip", "")
vip_changed = old_vip != vip
if vip_changed and self._is_exposed:
self.hacluster.set_vip(self.config.get("vip"))

old_port = self.peers.app_databag.get("current_port")
port_changed = old_port != str(self.config["listen_port"])
if port_changed and self._is_exposed:
if self.unit.is_leader():
self.peers.app_databag["current_port"] = str(self.config["listen_port"])
# Open port
try:
if old_port:
self.unit.close_port("tcp", old_port)
self.unit.open_port("tcp", self.config["listen_port"])
self.unit.set_ports(self.config["listen_port"])
except ModelError:
logger.exception("failed to open port")

if self.unit.is_leader():
self.peers.app_databag["current_port"] = str(self.config["listen_port"])
if vip:
self.peers.app_databag["current_vip"] = str(vip)
else:
self.peers.app_databag.pop("current_vip", None)

# TODO hitting upgrade errors here due to secrets labels failing to set on non-leaders.
# deferring until the leader manages to set the label
try:
@@ -546,6 +574,9 @@
if self.backend.postgres:
self.render_prometheus_service()

if port_changed or vip_changed:
self.update_client_connection_info()

def check_pgb_running(self):
"""Checks that pgbouncer service is running, and updates status accordingly."""
prom_service = f"{PGB}-{self.app.name}-prometheus"
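One note on the _on_config_changed changes above: the old close_port/open_port pair is replaced by ops' declarative Unit.set_ports, which removes the need to track and close the previous port explicitly. A minimal illustration (assuming an ops release that provides set_ports; not code from this PR):

```python
# Illustrative only: declarative port management with Unit.set_ports.
def apply_listen_port(unit, new_port: int) -> None:
    # set_ports declares the complete set of opened ports; any port opened
    # earlier but not listed here is closed automatically, so no separate
    # close_port bookkeeping is needed. A bare int is treated as a TCP port.
    unit.set_ports(new_port)
```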
@@ -910,14 +941,13 @@ def unit_ip(self) -> str:
# Relation Utilities
# =====================

def update_client_connection_info(self, port: Optional[str] = None):
def update_client_connection_info(self):
Review comment on lines -913 to +944 (author): Not really passed anywhere.

"""Update ports in backend relations to match updated pgbouncer port."""
# Skip updates if backend.postgres doesn't exist yet.
if not self.backend.postgres or not self.unit.is_leader():
return

if port is None:
port = self.config["listen_port"]
port = self.config["listen_port"]

for relation in self.model.relations.get("db", []):
self.legacy_db_relation.update_connection_info(relation, port)
1 change: 1 addition & 0 deletions src/constants.py
@@ -36,6 +36,7 @@
BACKEND_RELATION_NAME = "backend-database"
PEER_RELATION_NAME = "pgb-peers"
CLIENT_RELATION_NAME = "database"
HACLUSTER_RELATION_NAME = "ha"

TLS_KEY_FILE = "key.pem"
TLS_CA_FILE = "ca.pem"