Skip to content

Commit

Permalink
limit max length of text sent to elasticsearch
Browse files Browse the repository at this point in the history
closes #1865
  • Loading branch information
longhotsummer committed Jul 8, 2024
1 parent e64e959 commit 05b80e9
Showing 1 changed file with 16 additions and 1 deletion.
17 changes: 16 additions & 1 deletion peachjam_search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ class SearchableDocument(Document):
("nature", "name"),
]

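# Elasticsearch's max request size is 100 MB, so limit the size of the text fields to a little below that (80 MB).
# NOTE: the limit is compared against len(text), i.e. a count of characters, not of encoded bytes.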
MAX_TEXT_LENGTH = 80 * 1024 * 1024

def should_index_object(self, obj):
if isinstance(obj, ExternalDocument) or not obj.published:
return False
Expand Down Expand Up @@ -233,7 +237,13 @@ def prepare_content(self, instance):
if instance.content_html and (
not instance.content_html_is_akn or not instance.toc_json
):
return instance.get_content_as_text()
text = instance.get_content_as_text()
if text and len(text) > self.MAX_TEXT_LENGTH:
log.warning(
f"Limiting text content of {instance} to {self.MAX_TEXT_LENGTH} (length is {len(text)})"
)
text = text[: self.MAX_TEXT_LENGTH]
return text

def prepare_ranking(self, instance):
if instance.work.ranking > 0:
Expand Down Expand Up @@ -272,6 +282,11 @@ def prepare_pages(self, instance):
"""Text content of pages extracted from PDF."""
if not instance.content_html:
text = instance.get_content_as_text()
if text and len(text) > self.MAX_TEXT_LENGTH:
log.warning(
f"Limiting text content of {instance} to {self.MAX_TEXT_LENGTH} (length is {len(text)})"
)
text = text[: self.MAX_TEXT_LENGTH]
page_texts = text.split("\x0c")
pages = []
for i, page in enumerate(page_texts):
Expand Down

0 comments on commit 05b80e9

Please sign in to comment.