Skip to content

Commit

Permalink
Merge pull request #1896 from laws-africa/content-whitespace
Browse files Browse the repository at this point in the history
Clean content_html and handle when it's empty
  • Loading branch information
longhotsummer authored Jul 8, 2024
2 parents 1aaa29f + b2c698e commit 959ce8b
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 6 deletions.
3 changes: 2 additions & 1 deletion peachjam/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,8 @@ def clean_content_html(self):
# prevent CKEditor-based editing of AKN HTML
if self.instance.content_html_is_akn:
return self.instance.content_html
return self.cleaned_data["content_html"]
# ensure html is clean
return self.instance.clean_content_html(self.cleaned_data["content_html"])


class AttachedFilesInline(BaseAttachmentFileInline):
Expand Down
12 changes: 11 additions & 1 deletion peachjam/analysis/citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import requests
from django.conf import settings
from docpipe.matchers import ExtractedCitation
from lxml.etree import ParseError

from peachjam.models import CitationLink
from peachjam.xmlutils import parse_html_str

log = logging.getLogger(__name__)

Expand All @@ -28,7 +30,15 @@ def extract_citations(self, document):
return self.extract_citations_from_source_file(document)

def extract_citations_from_html(self, document):
html = lxml.html.fromstring(document.content_html)
try:
html = parse_html_str(document.content_html)
except ParseError as e:
log.warning(
f"Could not parse HTML for document {document.expression_uri()}: {document.content_html}",
exc_info=e,
)
return False

for matcher in self.matchers:
# the matcher can either manipulate the html, or return a new tree
res = matcher().markup_html_matches(document.expression_uri(), html)
Expand Down
53 changes: 53 additions & 0 deletions peachjam/migrations/0146_clean_content_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Generated by Django 3.2.25 on 2024-07-08 10:08
import re

from django.db import migrations
from django.db.models.functions import Length
from lxml.etree import ParserError

from peachjam.xmlutils import parse_html_str


def clean_content_html(content_html):
"""Ensure that content_html is not just whitespace HTML. Returns the cleaned value."""
if not content_html:
return None

# return None if the HTML doesn't have any content
try:
root = parse_html_str(content_html)
text = "".join(root.itertext()).strip()
text = re.sub(r"\s", "", text)
if not text:
return None
except (ValueError, ParserError):
return None

return content_html


def forwards(apps, schema_editor):
CoreDocument = apps.get_model("peachjam", "CoreDocument")
# get docs where the length of content_html is > 0 but less than 500
qs = CoreDocument.objects.annotate(text_length=Length("content_html")).filter(
content_html_is_akn=False,
content_html__isnull=False,
text_length__gt=0,
text_length__lt=1000,
)
for doc in qs.iterator(100):
html = doc.content_html
doc.content_html = clean_content_html(doc.content_html)
if doc.content_html != html:
doc.save(update_fields=["content_html"])


class Migration(migrations.Migration):

dependencies = [
("peachjam", "0145_alter_courtregistry_options"),
]

operations = [
migrations.RunPython(forwards, migrations.RunPython.noop),
]
22 changes: 20 additions & 2 deletions peachjam/models/core_document_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from docpipe.xmlutils import unwrap_element
from languages_plus.models import Language
from lxml import html
from lxml.etree import ParserError
from polymorphic.managers import PolymorphicManager
from polymorphic.models import PolymorphicModel
from polymorphic.query import PolymorphicQuerySet
Expand Down Expand Up @@ -479,6 +480,23 @@ def full_clean(self, *args, **kwargs):
self.pre_save()
super().full_clean(*args, **kwargs)

def clean_content_html(self, content_html):
"""Ensure that content_html is not just whitespace HTML. Returns the cleaned value."""
if not content_html:
return None

# return None if the HTML doesn't have any content
try:
root = parse_html_str(content_html)
text = "".join(root.itertext()).strip()
text = re.sub(r"\s", "", text)
if not text:
return None
except (ValueError, ParserError):
return None

return content_html

def clean(self):
super().clean()

Expand Down Expand Up @@ -590,7 +608,7 @@ def delete_citations(self):

if self.content_html and not self.content_html_is_akn:
# delete existing citations in html
root = html.fromstring(self.content_html)
root = parse_html_str(self.content_html)
deleted = False
for a in root.xpath('//a[starts-with(@href, "/akn/")]'):
unwrap_element(a)
Expand All @@ -612,7 +630,7 @@ def extract_content_from_source_file(self):
context = PipelineContext(word_pipeline)
context.source_file = self.source_file.file
word_pipeline(context)
self.content_html = context.html_text
self.content_html = self.clean_content_html(context.html_text)

for img in self.images.all():
img.delete()
Expand Down
5 changes: 3 additions & 2 deletions peachjam/models/journals_books.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ def delete_citations(self):
super().delete_citations()
# reset the HTML back to the original from markdown, because delete_citations()
# removes any embedded akn links
self.convert_content_markdown()
if self.content_markdown:
self.convert_content_markdown()

def convert_content_markdown(self):
self.content_html = markdownify(self.content_markdown)
self.content_html = markdownify(self.content_markdown or "")

def pre_save(self):
self.frbr_uri_doctype = "doc"
Expand Down
9 changes: 9 additions & 0 deletions peachjam/tests/test_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,12 @@ def test_journal(self):
journal.save()
self.assertEqual("doc", journal.frbr_uri_doctype)
self.assertEqual("journal", journal.frbr_uri_subtype)

def test_clean_content_html(self):
doc = CoreDocument()
self.assertIsNone(doc.clean_content_html(""""""))
self.assertIsNone(doc.clean_content_html("""<aoeu"""))
self.assertIsNone(doc.clean_content_html("""<div> \n&nbsp; \n</div>"""))
self.assertEqual(
doc.clean_content_html("""<div>test</div>"""), """<div>test</div>"""
)

0 comments on commit 959ce8b

Please sign in to comment.