From f19d49a9f832b91c4317c163fb82ec3f78f4edee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 13 Aug 2024 17:08:36 +0200 Subject: [PATCH] fix: improve question count query performance with vote The query was really slow, as we needed a seqscan to know the exact count of insights following the criteria, including the vote criterion. The count could be really high (ex: on https://hunger.openfoodfacts.org/questions we have 1.5M insights). By limiting the count to 100, we dramatically improve query performance. --- robotoff/app/api.py | 8 +++++++- robotoff/app/core.py | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/robotoff/app/api.py b/robotoff/app/api.py index ea3531c0a9..28c9a62ca2 100644 --- a/robotoff/app/api.py +++ b/robotoff/app/api.py @@ -1429,6 +1429,11 @@ def get_questions_resource_on_get( # Limit the number of brands to prevent slow SQL queries brands = brands[:10] + avoid_voted_on = _get_skip_voted_on(auth, device_id) + # Counting the number of insights that match the vote + # criteria can be very costly, so we limit the count to 100 + # if avoid_voted_on is not None + max_count = 100 if avoid_voted_on is not None else None get_insights_ = functools.partial( get_insights, server_type=server_type, @@ -1438,7 +1443,8 @@ def get_questions_resource_on_get( brands=brands, order_by=order_by, reserved_barcode=reserved_barcode, - avoid_voted_on=_get_skip_voted_on(auth, device_id), + avoid_voted_on=avoid_voted_on, + max_count=max_count, automatically_processable=False, campaigns=campaigns, predictor=predictor, diff --git a/robotoff/app/core.py b/robotoff/app/core.py index 6db13067fd..344aff94e6 100644 --- a/robotoff/app/core.py +++ b/robotoff/app/core.py @@ -79,6 +79,7 @@ def get_insights( limit: Optional[int] = 25, offset: Optional[int] = None, count: bool = False, + max_count: Optional[int] = None, avoid_voted_on: Optional[SkipVotedOn] = None, group_by_value_tag: Optional[bool] = False, automatically_processable:
Optional[bool] = None, @@ -116,6 +117,10 @@ def get_insights( :param offset: query offset (used for pagination), defaults to None :param count: if True, return the number of results instead of the results, defaults to False + :param max_count: an upper bound on the number of insights to count, + defaults to None. If provided, the count will be limited to this + value. This dramatically speeds up the count query. + If not provided, an exact count will be returned. :param avoid_voted_on: a SkipVotedOn used to remove results insights the user previously ignored, defaults to None :param group_by_value_tag: if True, group results by value_tag, defaults @@ -181,6 +186,8 @@ def get_insights( query = query.where(*where_clauses) if count: + if max_count is not None: + query = query.limit(max_count) return query.count() if limit is not None: