From d1fb461347b28a570b52a390464f0b03b42ac5a4 Mon Sep 17 00:00:00 2001 From: preemoDez Date: Thu, 15 Feb 2024 13:14:36 -0800 Subject: [PATCH 1/2] Fix the perf issue in building nodes from splits. Create the `relationships` object only once. Otherwise, it recomputes the whole text's hash for every node. It is very inefficient for long text. An alternative approach would be to cache the hash property. However, it wasn't so straightforward as `Document` isn't a cacheable type. I also do not know Python very well, maybe it would be enough to store a simple null and if it isn't null, then don't recompute? However, the most important reason is I'm not sure about the side effects and the existing assumption that the node is mutable and the hash always reflects the state during the call (unless we modify the object in multiple threads). This change doesn't break any assumptions. If the document was modified while we were creating nodes extracted from it, something would be very wrong. Benchmarks taken on a document attached to the bug: Before: Execution time for build_nodes_from_splits: 53.69 seconds After: Execution time for build_nodes_from_splits: 0.18 seconds --- .../llama_index/core/node_parser/node_utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/llama-index-core/llama_index/core/node_parser/node_utils.py b/llama-index-core/llama_index/core/node_parser/node_utils.py index b05d08223b6f5..a8c0fc8835a21 100644 --- a/llama-index-core/llama_index/core/node_parser/node_utils.py +++ b/llama-index-core/llama_index/core/node_parser/node_utils.py @@ -38,6 +38,12 @@ def build_nodes_from_splits( ref_doc = ref_doc or document id_func = id_func or default_id_func nodes: List[TextNode] = [] + """Calling as_related_node_info() on a document recomputes the hash for the whole text and metadata""" + """It is not that bad, when creating relationships between the nodes, but is terrible when adding a relationship""" + """between the node and a document, hence we create the relationship only once here and pass it to the nodes""" + relationships = { + NodeRelationship.SOURCE: ref_doc.as_related_node_info() + } for i, text_chunk in enumerate(text_splits): logger.debug(f"> Adding chunk: {truncate_text(text_chunk, 50)}") @@ -54,7 +60,7 @@ def build_nodes_from_splits( metadata_seperator=document.metadata_seperator, metadata_template=document.metadata_template, text_template=document.text_template, - relationships={NodeRelationship.SOURCE: ref_doc.as_related_node_info()}, + relationships=relationships, ) nodes.append(image_node) # type: ignore elif isinstance(document, Document): @@ -67,7 +73,7 @@ def build_nodes_from_splits( metadata_seperator=document.metadata_seperator, metadata_template=document.metadata_template, text_template=document.text_template, - relationships={NodeRelationship.SOURCE: ref_doc.as_related_node_info()}, + relationships=relationships, ) nodes.append(node) elif isinstance(document, TextNode): @@ -80,7 +86,7 @@ def build_nodes_from_splits( metadata_seperator=document.metadata_seperator, metadata_template=document.metadata_template, text_template=document.text_template, - relationships={NodeRelationship.SOURCE: ref_doc.as_related_node_info()}, + relationships=relationships, ) nodes.append(node) else: From 7b406a2a787044d68cea14ddad970631800f8b43 Mon Sep 17 00:00:00 2001 From: preemoDez Date: Thu, 15 Feb 2024 13:50:32 -0800 Subject: [PATCH 2/2] Fix the formatting --- llama-index-core/llama_index/core/node_parser/node_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llama-index-core/llama_index/core/node_parser/node_utils.py b/llama-index-core/llama_index/core/node_parser/node_utils.py index a8c0fc8835a21..a8c39cd49ec06 100644 --- a/llama-index-core/llama_index/core/node_parser/node_utils.py +++ b/llama-index-core/llama_index/core/node_parser/node_utils.py @@ -41,9 +41,7 @@ def build_nodes_from_splits( """Calling as_related_node_info() on a document recomputes the hash for the whole text and metadata""" """It is not that bad, when creating relationships between the nodes, but is terrible when adding a relationship""" """between the node and a document, hence we create the relationship only once here and pass it to the nodes""" - relationships = { - NodeRelationship.SOURCE: ref_doc.as_related_node_info() - } + relationships = {NodeRelationship.SOURCE: ref_doc.as_related_node_info()} for i, text_chunk in enumerate(text_splits): logger.debug(f"> Adding chunk: {truncate_text(text_chunk, 50)}")