Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prometheus metrics collector implementation #385

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Add the `health_check` applications to your `INSTALLED_APPS`:
'health_check.cache',
'health_check.storage',
'health_check.contrib.migrations',
'health_check.contrib.prometheus', # requires prometheus-client, transforms status into prometheus metrics
'health_check.contrib.celery', # requires celery
'health_check.contrib.celery_ping', # requires celery
'health_check.contrib.psutil', # disk and memory utilization; requires psutil
Expand Down Expand Up @@ -260,6 +261,30 @@ This should yield the following output:

Similar to the HTTP version, a critical error will cause the command to quit with exit code `1`.

## Prometheus support

The `health_check.contrib.prometheus` app provides an optional collector for the [prometheus-client](https://github.com/prometheus/client_python) library.

If you want to use it, add `health_check.contrib.prometheus` to your `INSTALLED_APPS`:

```python
INSTALLED_APPS = [
# ...
'health_check.contrib.prometheus',
]
```

When enabled, it will expose the health check status as a metric on the existing `/metrics` endpoint:

> # HELP django_health_check_errors The errors returned by the health checks
> # TYPE django_health_check_errors gauge
> django_health_check_errors{identifier="DatabaseHealthCheck",kind="critical_service"} 0.0
> django_health_check_errors{identifier="CustomHealthCheck",kind="normal"} 1.0
> # HELP django_health_check_duration_seconds The seconds taken by the health checks
> # TYPE django_health_check_duration_seconds gauge
> django_health_check_duration_seconds{identifier="DatabaseHealthCheck",kind="critical_service"} 0.01
> django_health_check_duration_seconds{identifier="CustomHealthCheck",kind="normal"} 1.0


## Other resources

Expand Down
4 changes: 4 additions & 0 deletions health_check/contrib/prometheus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import django

# Only Django releases older than 3.2 get an explicit pointer to the app
# config class; for 3.2 and later the name is deliberately left undefined.
if django.VERSION < (3, 2):
    default_app_config = "health_check.contrib.prometheus.apps.HealthCheckConfig"
17 changes: 17 additions & 0 deletions health_check/contrib/prometheus/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from django.apps import AppConfig
from prometheus_client.core import REGISTRY

from health_check.plugins import plugin_dir

from .backends import PrometheusChecker
from .collector import DjangoHealthCheckCollector


class HealthCheckConfig(AppConfig):
    """App config that wires the Prometheus integration in at startup."""

    name = "health_check.contrib.prometheus"

    def ready(self):
        """Register the metrics collector and the ``PrometheusChecker`` plugin.

        The collector is attached to the ``REGISTRY`` imported from
        ``prometheus_client.core``; afterwards ``PrometheusChecker`` is added
        to ``plugin_dir``.
        """
        # Order is preserved from the original: collector first, plugin second.
        DjangoHealthCheckCollector().register_to(REGISTRY)
        plugin_dir.register(PrometheusChecker)
22 changes: 22 additions & 0 deletions health_check/contrib/prometheus/backends.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from health_check.backends import BaseHealthCheckBackend
from health_check.mixins import CheckMixin


class Checker(CheckMixin):
    """Bare ``CheckMixin`` subclass; adds no behavior of its own."""


class PrometheusChecker(BaseHealthCheckBackend):
    """Backend used by the metrics collector to run checks via ``Checker``."""

    def check_status(self):
        """Do nothing: this backend performs no check of its own."""
        return None

    def check_plugins(self):
        """Run ``Checker.run_check()`` and return the checker's plugins.

        ``self.errors`` is reset before each run. An exception escaping
        ``run_check()`` is recorded through ``add_error()`` instead of being
        propagated to the caller.

        Returns:
            The ``plugins`` attribute of the ``Checker`` instance that was run.
        """
        self.errors = []
        runner = Checker()

        try:
            runner.run_check()
        except Exception as exc:
            self.add_error(exc)

        return runner.plugins
31 changes: 31 additions & 0 deletions health_check/contrib/prometheus/collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from prometheus_client.core import GaugeMetricFamily

from .backends import PrometheusChecker


class DjangoHealthCheckCollector:
    """Custom prometheus-client collector exposing health check results.

    Two gauge families are produced, both labelled with ``identifier`` and
    ``kind`` (``"critical_service"`` or ``"normal"``):

    * ``django_health_check_errors`` -- number of errors per plugin.
    * ``django_health_check_duration_seconds`` -- ``time_taken`` per plugin.
    """

    def _metric_families(self):
        """Return fresh, sample-less (errors, duration) gauge families."""
        errors = GaugeMetricFamily(
            "django_health_check_errors",
            "The errors returned by the health checks",
            labels=["identifier", "kind"],
        )
        durations = GaugeMetricFamily(
            "django_health_check_duration_seconds",
            "The seconds taken by the health checks",
            labels=["identifier", "kind"],
        )
        return errors, durations

    def describe(self):
        """Yield the metric families without running any health check.

        A registry created with ``auto_describe=True`` falls back to calling
        ``collect()`` at registration time when a collector has no
        ``describe()``. That would execute every health check while the
        collector is being registered, so the metric names are advertised
        here instead.
        """
        yield from self._metric_families()

    def collect(self):
        """Run the health checks and yield both populated gauge families.

        Yields:
            The errors family, then the duration family, each holding one
            sample per plugin returned by ``PrometheusChecker.check_plugins()``.
        """
        errors, durations = self._metric_families()

        checker = PrometheusChecker()

        for plugin in checker.check_plugins():
            kind = "critical_service" if plugin.critical_service else "normal"
            labels = [str(plugin.identifier()), kind]

            errors.add_metric(labels, len(plugin.errors))
            durations.add_metric(labels, plugin.time_taken)

        yield errors
        yield durations

    def register_to(self, registry):
        """Register this collector on ``registry`` via ``registry.register``."""
        registry.register(self)
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ test =
pytest-django
celery
redis
prometheus-client
docs =
sphinx

Expand Down
74 changes: 74 additions & 0 deletions tests/test_prometheus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from unittest import mock

import pytest
from prometheus_client.core import CollectorRegistry

from health_check.backends import BaseHealthCheckBackend
from health_check.contrib.prometheus.collector import DjangoHealthCheckCollector
from health_check.exceptions import HealthCheckException
from health_check.mixins import CheckMixin
from health_check.plugins import plugin_dir


class FakePlugin(BaseHealthCheckBackend):
    """Test double whose outcome is driven by the ``error`` attribute."""

    # Exception raised by check_status(); a falsy value means no error.
    error = None

    def check_status(self):
        """Raise ``self.error`` when it is set; otherwise return quietly."""
        if not self.error:
            return
        raise self.error


class TestDjangoHealthCheckCollector:
    """Tests for registering and collecting with DjangoHealthCheckCollector."""

    @pytest.fixture()
    def plugin(self):
        """Make ``FakePlugin`` the only plugin registered in ``plugin_dir``."""
        plugin_dir.reset()
        plugin_dir.register(FakePlugin)
        yield FakePlugin
        plugin_dir.reset()

    @pytest.fixture()
    def collector(self):
        """Provide a fresh collector instance for each test."""
        return DjangoHealthCheckCollector()

    def test_register_to(self, collector):
        """A registry the collector registered on must call its collect()."""
        registry = CollectorRegistry(auto_describe=True)
        collector.register_to(registry)

        with mock.patch.object(collector, "collect") as collect:
            list(registry.collect())

        collect.assert_called_once()

    @pytest.mark.parametrize(
        "error, critical_service, errors",
        [
            # Each (error, critical_service) combination appears exactly once.
            (None, True, 0),
            (None, False, 0),
            (HealthCheckException("test"), True, 1),
            (HealthCheckException("test"), False, 1),
        ],
    )
    def test_collect(self, monkeypatch, collector, plugin, error, critical_service, errors):
        """Collected samples carry the expected labels and error count."""
        # monkeypatch restores the class attributes after the test, so state
        # set on FakePlugin does not leak into other tests.
        monkeypatch.setattr(plugin, "error", error)
        monkeypatch.setattr(plugin, "critical_service", critical_service, raising=False)

        expected_kind = "critical_service" if critical_service else "normal"

        metric_families = list(collector.collect())
        assert metric_families

        for metric_family in metric_families:
            sample = metric_family.samples[0]

            assert sample.labels["identifier"] == "FakePlugin"
            assert sample.labels["kind"] == expected_kind

            if sample.name == "django_health_check_errors":
                assert sample.value == errors

            elif sample.name == "django_health_check_duration_seconds":
                assert sample.value is not None