From 27dfb5d4fab2b1a6f7fb38c686515bf448983de8 Mon Sep 17 00:00:00 2001 From: Ferran Llamas Date: Fri, 27 Dec 2024 12:24:22 +0100 Subject: [PATCH 1/2] Do not compute global text&tags twice on reindex --- .../src/nucliadb/ingest/orm/processor/processor.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nucliadb/src/nucliadb/ingest/orm/processor/processor.py b/nucliadb/src/nucliadb/ingest/orm/processor/processor.py index caa143dcf7..e66a86a540 100644 --- a/nucliadb/src/nucliadb/ingest/orm/processor/processor.py +++ b/nucliadb/src/nucliadb/ingest/orm/processor/processor.py @@ -313,14 +313,22 @@ async def txn( await self.apply_resource(message, resource, update=(not created)) # index message - if resource: await resource.compute_global_text() await resource.compute_global_tags(resource.indexer) await resource.compute_security(resource.indexer) if message.reindex: # when reindexing, let's just generate full new index message + # TODO - This should be improved in the future as it's not optimal for very large resources: + # As of now, there are some API operations that require fully reindexing all the fields of a resource. + # An example of this is classification label changes - we need to reindex all the fields of a resource to + # propagate the label changes to the index. resource.replace_indexer(await resource.generate_index_message(reindex=True)) + else: + # TODO - Ideally we should only update the fields that have been changed in the current transaction. + await resource.compute_global_text() + await resource.compute_global_tags(resource.indexer) + await resource.compute_security(resource.indexer) if resource and resource.modified: await pgcatalog_update(txn, kbid, resource) From ea66b10342fe627d0d468a4f08e0f0b82761182d Mon Sep 17 00:00:00 2001 From: Ferran Llamas Date: Fri, 27 Dec 2024 12:24:40 +0100 Subject: [PATCH 2/2] Do not compute global text&tags twice on reindex --- nucliadb/src/nucliadb/ingest/orm/processor/processor.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nucliadb/src/nucliadb/ingest/orm/processor/processor.py b/nucliadb/src/nucliadb/ingest/orm/processor/processor.py index e66a86a540..4cabe44627 100644 --- a/nucliadb/src/nucliadb/ingest/orm/processor/processor.py +++ b/nucliadb/src/nucliadb/ingest/orm/processor/processor.py @@ -314,9 +314,6 @@ async def txn( # index message if resource: - await resource.compute_global_text() - await resource.compute_global_tags(resource.indexer) - await resource.compute_security(resource.indexer) if message.reindex: # when reindexing, let's just generate full new index message # TODO - This should be improved in the future as it's not optimal for very large resources: