Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metrics #3216

Merged
merged 20 commits into from
Jan 16, 2024
Merged

Metrics #3216

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
7cfcb5d
Add Prometheus instrumentation
tillprochaska Oct 16, 2023
35ec9ca
Fix missing bind argument
tillprochaska Sep 21, 2023
58d4ee7
Run Prometheus exporter as a separate service
tillprochaska Oct 16, 2023
21202a8
Expose number of streaming requests and number of streamed entities a…
tillprochaska Oct 16, 2023
74e9eb7
Expose number of auth attempts as Prometheus metrics
tillprochaska Oct 16, 2023
e0769a7
Update Helm chart to expose metrics endpoints, setup ServiceMonitors
tillprochaska Nov 19, 2023
bfd7a60
Handle requests without Authz object gracefully
tillprochaska Nov 8, 2023
7bd852d
Rename Prometheus label to "api_endpoint" to prevent naming clashes
tillprochaska Nov 8, 2023
de5538f
Add xref metrics
tillprochaska Nov 17, 2023
155ec42
Use common prefix for all metric names
tillprochaska Nov 19, 2023
4c238d2
Expose Python platform information as Prometheus metrics
tillprochaska Nov 19, 2023
b27bae4
Remove unused port, network policy from K8s specs
tillprochaska Nov 19, 2023
d512e00
Use keyword args to set Prometheus metric labels
tillprochaska Nov 20, 2023
8137882
Bump servicelayer from 1.22.0 to 1.22.1
tillprochaska Nov 21, 2023
841f4b6
Simplify entity streaming metrics code
tillprochaska Nov 23, 2023
1288d94
Limit maximum size of Prometheus multiprocessing directory
tillprochaska Nov 23, 2023
33d9416
Do not let collector classes inherit from `object`
tillprochaska Nov 23, 2023
63fb216
Add `aleph_` prefix to Prometheus API metrics
tillprochaska Jan 9, 2024
ff937cd
Fix metrics name (singular -> plural)
tillprochaska Jan 9, 2024
6197c30
Add documentation on how to test Prometheus instrumentation in local …
tillprochaska Jan 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,7 @@ ENV ALEPH_ELASTICSEARCH_URI=http://elasticsearch:9200/ \
FTM_COMPARE_FREQUENCIES_DIR=/opt/ftm-compare/word-frequencies/ \
FTM_COMPARE_MODEL=/opt/ftm-compare/model.pkl

RUN mkdir /run/prometheus

# Run the green unicorn
CMD gunicorn -w 5 -b 0.0.0.0:8000 --log-level info --log-file - aleph.manage:app
CMD gunicorn --config /aleph/gunicorn.conf.py --workers 5 --log-level info --log-file - aleph.manage:app
3 changes: 3 additions & 0 deletions aleph/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from aleph.cache import Cache
from aleph.oauth import configure_oauth
from aleph.util import LoggingTransport
from aleph.metrics.flask import PrometheusExtension

import sentry_sdk
from sentry_sdk.integrations.flask import FlaskIntegration
Expand All @@ -39,6 +40,7 @@
mail = Mail()
babel = Babel()
talisman = Talisman()
prometheus = PrometheusExtension()


def determine_locale():
Expand Down Expand Up @@ -94,6 +96,7 @@ def create_app(config=None):
mail.init_app(app)
db.init_app(app)
babel.init_app(app, locale_selector=determine_locale)
prometheus.init_app(app)
CORS(
app,
resources=r"/api/*",
Expand Down
53 changes: 53 additions & 0 deletions aleph/logic/xref.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pprint import pformat, pprint # noqa
from tempfile import mkdtemp
from dataclasses import dataclass
from timeit import default_timer

import followthemoney
from followthemoney import model
Expand All @@ -15,6 +16,7 @@
from followthemoney_compare.models import GLMBernoulli2EEvaluator
from followthemoney.proxy import EntityProxy
from servicelayer.archive.util import ensure_path
from prometheus_client import Counter, Histogram

from aleph.core import es, db
from aleph.settings import SETTINGS
Expand All @@ -40,6 +42,36 @@
ORIGIN = "xref"
MODEL = None
FTM_VERSION_STR = f"ftm-{followthemoney.__version__}"
# Minimum score for a match to be counted in the XREF_MATCHES histogram; this
# mirrors the user-facing behavior, which only shows matches above a threshold.
SCORE_CUTOFF = 0.5

# Total number of entities and mentions processed by xref, across all runs.
XREF_ENTITIES = Counter(
    "aleph_xref_entities_total",
    "Total number of entities and mentions that have been xref'ed",
)

# Distribution of the number of matches found per xref'ed entity or mention.
XREF_MATCHES = Histogram(
    "aleph_xref_matches",
    "Number of matches per xref'ed entity or mention",
    buckets=[
        # Listing 0 as a separate bucket size because it's interesting to know
        # what percentage of entities result in no matches at all
        0,
        5,
        10,
        25,
        50,
    ],
)

# Query time as reported by Elasticsearch itself (the "took" field), i.e.
# excluding network transfer and client-side (de)serialization.
XREF_CANDIDATES_QUERY_DURATION = Histogram(
    "aleph_xref_candidates_query_duration_seconds",
    "Processing duration of the candidates query (excl. network, serialization etc.)",
)

# Wall-clock time of the same query as measured on the client side, i.e.
# including network transfer and (de)serialization overhead.
XREF_CANDIDATES_QUERY_ROUNDTRIP_DURATION = Histogram(
    "aleph_xref_candidates_query_roundtrip_duration_seconds",
    "Roundtrip duration of the candidates query (incl. network, serialization etc.)",
)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One additional metric related to xref that would be interesting is how long it takes to process a batch of entities from a single ES scroll response. This could be used to alert us when we are getting close to the scroll timeout.

I haven’t implemented this so far because the individual scroll requests are abstracted away by the scan helper from the ES Python client, so it would require a non-trivial amount of work and would increase complexity.



@dataclass
Expand Down Expand Up @@ -102,7 +134,15 @@ def _query_item(entity, entitysets=True):
query = {"query": query, "size": 50, "_source": ENTITY_SOURCE}
schemata = list(entity.schema.matchable_schemata)
index = entities_read_index(schema=schemata, expand=False)

start_time = default_timer()
result = es.search(index=index, body=query)
roundtrip_duration = max(0, default_timer() - start_time)
query_duration = result.get("took")
if query_duration is not None:
# ES returns milliseconds, but we track query time in seconds
query_duration = result.get("took") / 1000

candidates = []
for result in result.get("hits").get("hits"):
result = unpack_result(result)
Expand All @@ -116,7 +156,9 @@ def _query_item(entity, entitysets=True):
entity.caption,
len(candidates),
)

results = _bulk_compare([(entity, c) for c in candidates])
match_count = 0
for match, (score, doubt, method) in zip(candidates, results):
log.debug(
"Match: %s: %s <[%.2f]@%0.2f> %s",
Expand All @@ -136,6 +178,17 @@ def _query_item(entity, entitysets=True):
match=match,
entityset_ids=entityset_ids,
)
if score > SCORE_CUTOFF:
# While we store all xref matches with a score > 0, we only count matches
# with a score above a threshold. This is in line with the user-facing behavior
# which also only shows matches above the threshold.
match_count += 1

XREF_ENTITIES.inc()
XREF_MATCHES.observe(match_count)
XREF_CANDIDATES_QUERY_ROUNDTRIP_DURATION.observe(roundtrip_duration)
if query_duration:
XREF_CANDIDATES_QUERY_DURATION.observe(query_duration)


def _iter_mentions(collection):
Expand Down
Empty file added aleph/metrics/__init__.py
Empty file.
178 changes: 178 additions & 0 deletions aleph/metrics/collectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
from sqlalchemy import func
from prometheus_client.core import GaugeMetricFamily, InfoMetricFamily
from prometheus_client.registry import Collector
from followthemoney import __version__ as ftm_version

from aleph import __version__ as aleph_version
from aleph.core import create_app as create_flask_app
from aleph.queues import get_active_dataset_status
from aleph.model import Role, Collection, EntitySet, Bookmark


class InfoCollector(Collector):
    """Expose Aleph and FollowTheMoney version information as an info metric."""

    def collect(self):
        versions = {
            "aleph_version": aleph_version,
            "ftm_version": ftm_version,
        }
        yield InfoMetricFamily(
            "aleph_system",
            "Aleph system information",
            value=versions,
        )


class DatabaseCollector(Collector):
    """Expose aggregate statistics about Aleph's database contents."""

    def __init__(self):
        # Database queries require a Flask application context, so keep an
        # app instance around for the lifetime of the collector.
        self._flask_app = create_flask_app()

    def collect(self):
        # Each producer returns one metric family; run them all inside a
        # single application context.
        producers = (
            self._users,
            self._collections,
            self._collection_users,
            self._entitysets,
            self._entityset_users,
            self._bookmarks,
            self._bookmark_users,
        )
        with self._flask_app.app_context():
            for produce in producers:
                yield produce()

    def _users(self):
        """Gauge of the total number of user accounts."""
        user_count = Role.all_users().count()
        return GaugeMetricFamily(
            "aleph_users",
            "Total number of users",
            value=user_count,
        )
Comment on lines +38 to +43
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This metric could be extended in the future. In particular, it might be interesting to expose the number of users that have been active within the past 24h, 7d, 30d etc. This requires some additional, non-trivial work because we’d need to track when users signed in the last time, so I decided not to implement it in this PR.


def _collections(self):
gauge = GaugeMetricFamily(
"aleph_collections",
"Total number of collections by category",
labels=["category"],
)

query = (
Collection.all()
.with_entities(Collection.category, func.count())
.group_by(Collection.category)
)

for category, count in query:
gauge.add_metric([category], count)

return gauge
Comment on lines +45 to +61
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This metric could be extended in the future to expose number of collections by countries.


def _collection_users(self):
gauge = GaugeMetricFamily(
"aleph_collection_users",
"Total number of users that have created at least one collection",
labels=["category"],
)

query = (
Collection.all()
.with_entities(
Collection.category,
func.count(func.distinct(Collection.creator_id)),
)
.group_by(Collection.category)
)

for category, count in query:
gauge.add_metric([category], count)

return gauge

def _entitysets(self):
gauge = GaugeMetricFamily(
"aleph_entitysets",
"Total number of entity set by type",
labels=["type"],
)

query = (
EntitySet.all()
.with_entities(EntitySet.type, func.count())
.group_by(EntitySet.type)
)

for entityset_type, count in query:
gauge.add_metric([entityset_type], count)

return gauge

def _entityset_users(self):
gauge = GaugeMetricFamily(
"aleph_entityset_users",
"Number of users that have created at least on entity set of the given type",
labels=["type"],
)

query = (
EntitySet.all()
.with_entities(
EntitySet.type,
func.count(func.distinct(EntitySet.role_id)),
)
.group_by(EntitySet.type)
)

for entityset_type, count in query:
gauge.add_metric([entityset_type], count)

return gauge

def _bookmarks(self):
return GaugeMetricFamily(
"aleph_bookmarks",
"Total number of bookmarks",
value=Bookmark.query.count(),
)

def _bookmark_users(self):
return GaugeMetricFamily(
"aleph_bookmark_users",
"Number of users that have created at least one bookmark",
value=Bookmark.query.distinct(Bookmark.role_id).count(),
)


class QueuesCollector(Collector):
    """Expose task queue statistics: active datasets and per-stage task counts."""

    def collect(self):
        status = get_active_dataset_status()

        yield GaugeMetricFamily(
            "aleph_active_datasets",
            "Total number of active datasets",
            value=status["total"],
        )

        # Aggregate pending/running task counts per stage across all datasets
        # and jobs. Simplified from an init-then-rebuild pattern to a single
        # setdefault + in-place accumulation (behavior unchanged).
        stages = {}
        for collection_status in status["datasets"].values():
            for job_status in collection_status["jobs"]:
                for stage_status in job_status["stages"]:
                    stage = stage_status["stage"]
                    counts = stages.setdefault(
                        stage, {"pending": 0, "running": 0}
                    )
                    counts["pending"] += stage_status["pending"]
                    counts["running"] += stage_status["running"]

        tasks_gauge = GaugeMetricFamily(
            "aleph_tasks",
            "Total number of pending or running tasks in a given stage",
            labels=["stage", "status"],
        )

        for stage, tasks in stages.items():
            tasks_gauge.add_metric([stage, "pending"], tasks["pending"])
            tasks_gauge.add_metric([stage, "running"], tasks["running"])

        yield tasks_gauge
17 changes: 17 additions & 0 deletions aleph/metrics/exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from prometheus_client import make_wsgi_app, PLATFORM_COLLECTOR
from prometheus_client.core import CollectorRegistry

from aleph.metrics.collectors import InfoCollector, DatabaseCollector, QueuesCollector


def create_app():
    """Build a standalone WSGI app that serves Aleph's Prometheus metrics.

    Uses a dedicated registry (rather than the default global one) so that
    only the explicitly registered collectors are exposed.
    """
    registry = CollectorRegistry()
    collectors = (
        PLATFORM_COLLECTOR,
        InfoCollector(),
        DatabaseCollector(),
        QueuesCollector(),
    )
    for collector in collectors:
        registry.register(collector)

    return make_wsgi_app(registry=registry)


app = create_app()
Loading
Loading