Skip to content

Commit 533b5ce

Browse files
committed
✨(backend) add fallback search & default ordering
Filter deleted documents from visited ones. Set default ordering to the Find API search call (-updated_at) BaseDocumentIndexer.search now returns a list of document ids instead of models. Do not call the indexer in signals when SEARCH_INDEXER_CLASS is not defined or properly configured. Signed-off-by: Fabre Florian <[email protected]>
1 parent 4b6092d commit 533b5ce

File tree

11 files changed

+562
-157
lines changed

11 files changed

+562
-157
lines changed

src/backend/core/api/serializers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -883,3 +883,7 @@ class FindDocumentSerializer(serializers.Serializer):
883883
"""Serializer for Find search requests"""
884884

885885
q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
886+
page_size = serializers.IntegerField(
887+
required=False, min_value=1, max_value=50, default=20
888+
)
889+
page = serializers.IntegerField(required=False, min_value=1, default=1)

src/backend/core/api/viewsets.py

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from django.contrib.postgres.aggregates import ArrayAgg
1313
from django.contrib.postgres.search import TrigramSimilarity
1414
from django.core.cache import cache
15-
from django.core.exceptions import ImproperlyConfigured, ValidationError
15+
from django.core.exceptions import ValidationError
1616
from django.core.files.storage import default_storage
1717
from django.core.validators import URLValidator
1818
from django.db import connection, transaction
@@ -49,7 +49,10 @@
4949
from core.services.converter_services import (
5050
YdocConverter,
5151
)
52-
from core.services.search_indexers import get_document_indexer_class
52+
from core.services.search_indexers import (
53+
default_document_indexer,
54+
get_visited_document_ids_of,
55+
)
5356
from core.tasks.mail import send_ask_for_access_mail
5457
from core.utils import extract_attachments, filter_descendants
5558

@@ -1042,23 +1045,42 @@ def search(self, request, *args, **kwargs):
10421045
The filtering allows full text search through the opensearch indexation app "find".
10431046
"""
10441047
access_token = request.session.get("oidc_access_token")
1048+
user = request.user
10451049

10461050
serializer = serializers.FindDocumentSerializer(data=request.query_params)
10471051
serializer.is_valid(raise_exception=True)
10481052

1049-
try:
1050-
indexer = get_document_indexer_class()()
1051-
queryset = indexer.search(
1052-
text=serializer.validated_data.get("q", ""),
1053-
user=request.user,
1054-
token=access_token,
1053+
indexer = default_document_indexer()
1054+
1055+
if not indexer:
1056+
queryset = self.get_queryset()
1057+
filterset = DocumentFilter(
1058+
{"title": serializer.validated_data.get("q", "")}, queryset=queryset
10551059
)
1056-
except ImproperlyConfigured:
1057-
return drf.response.Response(
1058-
{"detail": "The service is not properly configured."},
1059-
status=status.HTTP_401_UNAUTHORIZED,
1060+
1061+
if not filterset.is_valid():
1062+
raise drf.exceptions.ValidationError(filterset.errors)
1063+
1064+
queryset = filterset.filter_queryset(queryset).order_by("-updated_at")
1065+
1066+
return self.get_response_for_queryset(
1067+
queryset,
1068+
context={
1069+
"request": request,
1070+
},
10601071
)
10611072

1073+
queryset = models.Document.objects.all()
1074+
results = indexer.search(
1075+
text=serializer.validated_data.get("q", ""),
1076+
token=access_token,
1077+
visited=get_visited_document_ids_of(queryset, user),
1078+
page=serializer.validated_data.get("page", 1),
1079+
page_size=serializer.validated_data.get("page_size", 20),
1080+
)
1081+
1082+
queryset = queryset.filter(pk__in=results)
1083+
10621084
return self.get_response_for_queryset(
10631085
queryset,
10641086
context={

src/backend/core/services/search_indexers.py

Lines changed: 53 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from django.conf import settings
99
from django.contrib.auth.models import AnonymousUser
1010
from django.core.exceptions import ImproperlyConfigured
11+
from django.db.models import Subquery
1112
from django.utils.module_loading import import_string
1213

1314
import requests
@@ -18,7 +19,23 @@
1819

1920

2021
@cache
21-
def get_document_indexer_class() -> "BaseDocumentIndexer":
22+
def default_document_indexer():
23+
"""Returns default indexer service is enabled and properly configured."""
24+
25+
# For this usecase an empty indexer class is not an issue but a feature.
26+
if not getattr(settings, "SEARCH_INDEXER_CLASS", None):
27+
logger.info("Document indexer is not configured (see SEARCH_INDEXER_CLASS)")
28+
return None
29+
30+
try:
31+
return get_document_indexer_class()()
32+
except ImproperlyConfigured as err:
33+
logger.error("Document indexer is not properly configured : %s", err)
34+
return None
35+
36+
37+
@cache
38+
def get_document_indexer_class():
2239
"""Return the indexer backend class based on the settings."""
2340
classpath = settings.SEARCH_INDEXER_CLASS
2441

@@ -65,7 +82,7 @@ def get_batch_accesses_by_users_and_teams(paths):
6582
return dict(access_by_document_path)
6683

6784

68-
def get_visited_document_ids_of(user):
85+
def get_visited_document_ids_of(queryset, user):
6986
"""
7087
Returns the ids of the documents that have a linktrace to the user and NOT owned.
7188
It will be use to limit the opensearch responses to the public documents already
@@ -74,11 +91,18 @@ def get_visited_document_ids_of(user):
7491
if isinstance(user, AnonymousUser):
7592
return []
7693

77-
qs = models.LinkTrace.objects.filter(user=user).exclude(
78-
document__accesses__user=user,
94+
qs = models.LinkTrace.objects.filter(user=user)
95+
96+
docs = (
97+
queryset.exclude(accesses__user=user)
98+
.filter(
99+
deleted_at__isnull=True,
100+
ancestors_deleted_at__isnull=True,
101+
)
102+
.filter(pk__in=Subquery(qs.values("document_id")))
79103
)
80104

81-
return list({str(id) for id in qs.values_list("document_id", flat=True)})
105+
return list({str(id) for id in docs.values_list("pk", flat=True)})
82106

83107

84108
class BaseDocumentIndexer(ABC):
@@ -159,22 +183,41 @@ def push(self, data):
159183
Must be implemented by subclasses.
160184
"""
161185

162-
def search(self, text, user, token):
186+
# pylint: disable-next=too-many-arguments,too-many-positional-arguments
187+
def search(self, text, token, visited=(), page=1, page_size=50):
163188
"""
164189
Search for documents in Find app.
165-
"""
166-
visited_ids = get_visited_document_ids_of(user)
190+
Ensure the same default ordering as "Docs" list : -updated_at
167191
192+
Returns ids of the documents
193+
194+
Args:
195+
text (str): Text search content.
196+
token (str): OIDC Authentication token.
197+
visited (list, optional):
198+
List of ids of active public documents with LinkTrace
199+
Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
200+
page (int, optional):
201+
The page number to retrieve.
202+
Defaults to 1 if not specified.
203+
page_size (int, optional):
204+
The number of results to return per page.
205+
Defaults to 50 if not specified.
206+
"""
168207
response = self.search_query(
169208
data={
170209
"q": text,
171-
"visited": visited_ids,
210+
"visited": visited,
172211
"services": ["docs"],
212+
"page_number": page,
213+
"page_size": page_size,
214+
"order_by": "updated_at",
215+
"order_direction": "desc",
173216
},
174217
token=token,
175218
)
176219

177-
return self.format_response(response)
220+
return [d["_id"] for d in response]
178221

179222
@abstractmethod
180223
def search_query(self, data, token) -> dict:
@@ -184,14 +227,6 @@ def search_query(self, data, token) -> dict:
184227
Must be implemented by subclasses.
185228
"""
186229

187-
@abstractmethod
188-
def format_response(self, data: dict):
189-
"""
190-
Convert the JSON response from Find app as document queryset.
191-
192-
Must be implemented by subclasses.
193-
"""
194-
195230

196231
class FindDocumentIndexer(BaseDocumentIndexer):
197232
"""
@@ -253,20 +288,13 @@ def search_query(self, data, token) -> requests.Response:
253288
logger.error("HTTPError: %s", e)
254289
raise
255290

256-
def format_response(self, data: dict):
257-
"""
258-
Retrieve documents ids from Find app response and return a queryset.
259-
"""
260-
return models.Document.objects.filter(pk__in=[d["_id"] for d in data])
261-
262291
def push(self, data):
263292
"""
264293
Push a batch of documents to the Find backend.
265294
266295
Args:
267296
data (list): List of document dictionaries.
268297
"""
269-
270298
try:
271299
response = requests.post(
272300
self.indexer_url,

src/backend/core/signals.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@
22
Declare and configure the signals for the impress core application
33
"""
44

5+
from functools import partial
6+
7+
from django.db import transaction
58
from django.db.models import signals
69
from django.dispatch import receiver
710

811
from . import models
12+
from .services.search_indexers import default_document_indexer
913
from .tasks.find import trigger_document_indexer
1014

1115

@@ -16,13 +20,14 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar
1620
Note : Within the transaction we can have an empty content and a serialization
1721
error.
1822
"""
19-
trigger_document_indexer(instance, on_commit=True)
23+
if default_document_indexer() is not None:
24+
transaction.on_commit(partial(trigger_document_indexer, instance))
2025

2126

2227
@receiver(signals.post_save, sender=models.DocumentAccess)
2328
def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument
2429
"""
2530
Asynchronous call to the document indexer at the end of the transaction.
2631
"""
27-
if not created:
28-
trigger_document_indexer(instance.document, on_commit=True)
32+
if not created and default_document_indexer() is not None:
33+
transaction.on_commit(partial(trigger_document_indexer, instance.document))

src/backend/core/tasks/find.py

Lines changed: 19 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
11
"""Trigger document indexation using celery task."""
22

3-
from functools import partial
43
from logging import getLogger
54

65
from django.conf import settings
76
from django.core.cache import cache
8-
from django.db import transaction
97

108
from impress.celery_app import app
119

@@ -37,7 +35,7 @@ def decr_counter(key):
3735

3836
@app.task
3937
def document_indexer_task(document_id):
40-
"""Send indexation query for a document using celery task."""
38+
"""Celery Task : Sends indexation query for a document."""
4139
key = document_indexer_debounce_key(document_id)
4240

4341
# check if the counter : if still up, skip the task. only the last one
@@ -46,6 +44,7 @@ def document_indexer_task(document_id):
4644
logger.info("Skip document %s indexation", document_id)
4745
return
4846

47+
# Prevents some circular imports
4948
# pylint: disable=import-outside-toplevel
5049
from core import models # noqa: PLC0415
5150
from core.services.search_indexers import ( # noqa: PLC0415
@@ -63,35 +62,27 @@ def document_indexer_task(document_id):
6362
indexer.push(data)
6463

6564

66-
def trigger_document_indexer(document, on_commit=False):
65+
def trigger_document_indexer(document):
6766
"""
6867
Trigger indexation task with debounce a delay set by the SEARCH_INDEXER_COUNTDOWN setting.
6968
7069
Args:
7170
document (Document): The document instance.
72-
on_commit (bool): Wait for the end of the transaction before starting the task
73-
(some fields may be in wrong state within the transaction)
7471
"""
75-
7672
if document.deleted_at or document.ancestors_deleted_at:
77-
pass
78-
79-
if on_commit:
80-
transaction.on_commit(
81-
partial(trigger_document_indexer, document, on_commit=False)
82-
)
83-
else:
84-
key = document_indexer_debounce_key(document.pk)
85-
countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)
86-
87-
logger.info(
88-
"Add task for document %s indexation in %.2f seconds",
89-
document.pk,
90-
countdown,
91-
)
92-
93-
# Each time this method is called during the countdown, we increment the
94-
# counter and each task decrease it, so the index be run only once.
95-
incr_counter(key)
96-
97-
document_indexer_task.apply_async(args=[document.pk], countdown=countdown)
73+
return
74+
75+
key = document_indexer_debounce_key(document.pk)
76+
countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)
77+
78+
logger.info(
79+
"Add task for document %s indexation in %.2f seconds",
80+
document.pk,
81+
countdown,
82+
)
83+
84+
# Each time this method is called during the countdown, we increment the
85+
# counter and each task decrease it, so the index be run only once.
86+
incr_counter(key)
87+
88+
document_indexer_task.apply_async(args=[document.pk], countdown=countdown)

src/backend/core/tests/commands/test_index.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616

1717
@pytest.mark.django_db
18+
@pytest.mark.usefixtures("indexer_settings")
1819
def test_index():
1920
"""Test the command `index` that run the Find app indexer for all the available documents."""
2021
user = factories.UserFactory()

src/backend/core/tests/conftest.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,32 @@ def mock_user_teams():
2424
"core.models.User.teams", new_callable=mock.PropertyMock
2525
) as mock_teams:
2626
yield mock_teams
27+
28+
29+
@pytest.fixture(name="indexer_settings")
30+
def indexer_settings_fixture(settings):
31+
"""
32+
Setup valid settings for the document indexer. Clear the indexer cache.
33+
"""
34+
35+
# pylint: disable-next=import-outside-toplevel
36+
from core.services.search_indexers import ( # noqa: PLC0415
37+
default_document_indexer,
38+
get_document_indexer_class,
39+
)
40+
41+
default_document_indexer.cache_clear()
42+
get_document_indexer_class.cache_clear()
43+
44+
settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.FindDocumentIndexer"
45+
settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest"
46+
settings.SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/"
47+
settings.SEARCH_INDEXER_QUERY_URL = (
48+
"http://localhost:8081/api/v1.0/documents/search/"
49+
)
50+
51+
yield settings
52+
53+
# clear cache to prevent issues with other tests
54+
default_document_indexer.cache_clear()
55+
get_document_indexer_class.cache_clear()

0 commit comments

Comments
 (0)