Skip to content

Commit 2d19a4a

Browse files
committed
✨(backend) Index partially empty documents
Only documents without title and content are ignored by indexer.
1 parent 533b5ce commit 2d19a4a

File tree

3 files changed

+45
-8
lines changed

3 files changed

+45
-8
lines changed

src/backend/core/management/commands/index.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,11 @@ def handle(self, *args, **options):
2121
"""Launch and log search index generation."""
2222
logger.info("Starting to regenerate Find index...")
2323
start = time.perf_counter()
24-
25-
FindDocumentIndexer().index()
24+
count = FindDocumentIndexer().index()
2625

2726
duration = time.perf_counter() - start
28-
logger.info("Search index regenerated in %.2f seconds.", duration)
27+
logger.info(
28+
"Search index regenerated from %d document(s) in %.2f seconds.",
29+
count,
30+
duration,
31+
)

src/backend/core/services/search_indexers.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@ def index(self):
146146
Fetch documents in batches, serialize them, and push to the search backend.
147147
"""
148148
last_id = 0
149+
count = 0
150+
149151
while True:
150152
documents_batch = list(
151153
models.Document.objects.filter(
@@ -163,9 +165,13 @@ def index(self):
163165
serialized_batch = [
164166
self.serialize_document(document, accesses_by_document_path)
165167
for document in documents_batch
166-
if document.content
168+
if document.content or document.title
167169
]
170+
168171
self.push(serialized_batch)
172+
count += len(serialized_batch)
173+
174+
return count
169175

170176
@abstractmethod
171177
def serialize_document(self, document, accesses):

src/backend/core/tests/test_services_search_indexers.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
304304
access = factories.UserDocumentAccessFactory(document=document)
305305
expected_user_subs[str(document.id)] = str(access.user.sub)
306306

307-
FindDocumentIndexer().index()
307+
assert FindDocumentIndexer().index() == 5
308308

309309
# Should be 3 batches: 2 + 2 + 1
310310
assert mock_push.call_count == 3
@@ -327,6 +327,34 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
327327
assert seen_doc_ids == {str(d.id) for d in documents}
328328

329329

330+
@patch.object(FindDocumentIndexer, "push")
331+
@pytest.mark.usefixtures("indexer_settings")
332+
def test_services_search_indexers_ignore_empty_documents(mock_push):
333+
"""
334+
Documents with neither a title nor content should be ignored by the indexer,
335+
while documents missing only one of the two should still be indexed.
336+
"""
337+
document = factories.DocumentFactory()
338+
factories.DocumentFactory(content="", title="")
339+
empty_title = factories.DocumentFactory(title="")
340+
empty_content = factories.DocumentFactory(content="")
341+
342+
assert FindDocumentIndexer().index() == 3
343+
344+
assert mock_push.call_count == 1
345+
346+
# Make sure only non-empty documents (with a title or content) are indexed
347+
results = {doc["id"] for doc in mock_push.call_args[0][0]}
348+
assert results == {
349+
str(d.id)
350+
for d in (
351+
document,
352+
empty_content,
353+
empty_title,
354+
)
355+
}
356+
357+
330358
@patch.object(FindDocumentIndexer, "push")
331359
@pytest.mark.usefixtures("indexer_settings")
332360
def test_services_search_indexers_ancestors_link_reach(mock_push):
@@ -338,7 +366,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push):
338366
parent = factories.DocumentFactory(parent=grand_parent, link_reach="public")
339367
document = factories.DocumentFactory(parent=parent, link_reach="restricted")
340368

341-
FindDocumentIndexer().index()
369+
assert FindDocumentIndexer().index() == 4
342370

343371
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
344372
assert len(results) == 4
@@ -358,7 +386,7 @@ def test_services_search_indexers_ancestors_users(mock_push):
358386
parent = factories.DocumentFactory(parent=grand_parent, users=[user_p])
359387
document = factories.DocumentFactory(parent=parent, users=[user_d])
360388

361-
FindDocumentIndexer().index()
389+
assert FindDocumentIndexer().index() == 3
362390

363391
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
364392
assert len(results) == 3
@@ -379,7 +407,7 @@ def test_services_search_indexers_ancestors_teams(mock_push):
379407
parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"])
380408
document = factories.DocumentFactory(parent=parent, teams=["team_d"])
381409

382-
FindDocumentIndexer().index()
410+
assert FindDocumentIndexer().index() == 3
383411

384412
results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
385413
assert len(results) == 3

0 commit comments

Comments
 (0)