Merge pull request #1896 from laws-africa/content-whitespace

Clean content_html and handle when it's empty
laws-africa · Jul 8, 2024 · 959ce8b · 959ce8b
2 parents 1aaa29f + b2c698e
commit 959ce8b
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 6 deletions.
diff --git a/peachjam/admin.py b/peachjam/admin.py
@@ -323,7 +323,8 @@ def clean_content_html(self):
         # prevent CKEditor-based editing of AKN HTML
         if self.instance.content_html_is_akn:
             return self.instance.content_html
-        return self.cleaned_data["content_html"]
+        # ensure html is clean
+        return self.instance.clean_content_html(self.cleaned_data["content_html"])
 
 
 class AttachedFilesInline(BaseAttachmentFileInline):

diff --git a/peachjam/analysis/citations.py b/peachjam/analysis/citations.py
@@ -5,8 +5,10 @@
 import requests
 from django.conf import settings
 from docpipe.matchers import ExtractedCitation
+from lxml.etree import ParseError
 
 from peachjam.models import CitationLink
+from peachjam.xmlutils import parse_html_str
 
 log = logging.getLogger(__name__)
 
@@ -28,7 +30,15 @@ def extract_citations(self, document):
             return self.extract_citations_from_source_file(document)
 
     def extract_citations_from_html(self, document):
-        html = lxml.html.fromstring(document.content_html)
+        try:
+            html = parse_html_str(document.content_html)
+        except ParseError as e:
+            log.warning(
+                f"Could not parse HTML for document {document.expression_uri()}: {document.content_html}",
+                exc_info=e,
+            )
+            return False
+
         for matcher in self.matchers:
             # the matcher can either manipulate the html, or return a new tree
             res = matcher().markup_html_matches(document.expression_uri(), html)

diff --git a/peachjam/migrations/0146_clean_content_html.py b/peachjam/migrations/0146_clean_content_html.py
@@ -0,0 +1,53 @@
+# Generated by Django 3.2.25 on 2024-07-08 10:08
+import re
+
+from django.db import migrations
+from django.db.models.functions import Length
+from lxml.etree import ParserError
+
+from peachjam.xmlutils import parse_html_str
+
+
+def clean_content_html(content_html):
+    """Ensure that content_html is not just whitespace HTML. Returns the cleaned value."""
+    if not content_html:
+        return None
+
+    # return None if the HTML doesn't have any content
+    try:
+        root = parse_html_str(content_html)
+        text = "".join(root.itertext()).strip()
+        text = re.sub(r"\s", "", text)
+        if not text:
+            return None
+    except (ValueError, ParserError):
+        return None
+
+    return content_html
+
+
+def forwards(apps, schema_editor):
+    CoreDocument = apps.get_model("peachjam", "CoreDocument")
+    # get docs where the length of content_html is > 0 but less than 500
+    qs = CoreDocument.objects.annotate(text_length=Length("content_html")).filter(
+        content_html_is_akn=False,
+        content_html__isnull=False,
+        text_length__gt=0,
+        text_length__lt=1000,
+    )
+    for doc in qs.iterator(100):
+        html = doc.content_html
+        doc.content_html = clean_content_html(doc.content_html)
+        if doc.content_html != html:
+            doc.save(update_fields=["content_html"])
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("peachjam", "0145_alter_courtregistry_options"),
+    ]
+
+    operations = [
+        migrations.RunPython(forwards, migrations.RunPython.noop),
+    ]
diff --git a/peachjam/models/core_document_model.py b/peachjam/models/core_document_model.py
@@ -23,6 +23,7 @@
 from docpipe.xmlutils import unwrap_element
 from languages_plus.models import Language
 from lxml import html
+from lxml.etree import ParserError
 from polymorphic.managers import PolymorphicManager
 from polymorphic.models import PolymorphicModel
 from polymorphic.query import PolymorphicQuerySet
@@ -479,6 +480,23 @@ def full_clean(self, *args, **kwargs):
         self.pre_save()
         super().full_clean(*args, **kwargs)
 
+    def clean_content_html(self, content_html):
+        """Ensure that content_html is not just whitespace HTML. Returns the cleaned value."""
+        if not content_html:
+            return None
+
+        # return None if the HTML doesn't have any content
+        try:
+            root = parse_html_str(content_html)
+            text = "".join(root.itertext()).strip()
+            text = re.sub(r"\s", "", text)
+            if not text:
+                return None
+        except (ValueError, ParserError):
+            return None
+
+        return content_html
+
     def clean(self):
         super().clean()
 
@@ -590,7 +608,7 @@ def delete_citations(self):
 
         if self.content_html and not self.content_html_is_akn:
             # delete existing citations in html
-            root = html.fromstring(self.content_html)
+            root = parse_html_str(self.content_html)
             deleted = False
             for a in root.xpath('//a[starts-with(@href, "/akn/")]'):
                 unwrap_element(a)
@@ -612,7 +630,7 @@ def extract_content_from_source_file(self):
             context = PipelineContext(word_pipeline)
             context.source_file = self.source_file.file
             word_pipeline(context)
-            self.content_html = context.html_text
+            self.content_html = self.clean_content_html(context.html_text)
 
             for img in self.images.all():
                 img.delete()

diff --git a/peachjam/models/journals_books.py b/peachjam/models/journals_books.py
@@ -13,10 +13,11 @@ def delete_citations(self):
         super().delete_citations()
         # reset the HTML back to the original from markdown, because delete_citations()
         # removes any embedded akn links
-        self.convert_content_markdown()
+        if self.content_markdown:
+            self.convert_content_markdown()
 
     def convert_content_markdown(self):
-        self.content_html = markdownify(self.content_markdown)
+        self.content_html = markdownify(self.content_markdown or "")
 
     def pre_save(self):
         self.frbr_uri_doctype = "doc"

diff --git a/peachjam/tests/test_documents.py b/peachjam/tests/test_documents.py
@@ -107,3 +107,12 @@ def test_journal(self):
         journal.save()
         self.assertEqual("doc", journal.frbr_uri_doctype)
         self.assertEqual("journal", journal.frbr_uri_subtype)
+
+    def test_clean_content_html(self):
+        doc = CoreDocument()
+        self.assertIsNone(doc.clean_content_html(""""""))
+        self.assertIsNone(doc.clean_content_html("""<aoeu"""))
+        self.assertIsNone(doc.clean_content_html("""<div>   \n&nbsp;  \n</div>"""))
+        self.assertEqual(
+            doc.clean_content_html("""<div>test</div>"""), """<div>test</div>"""
+        )