Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean content_html and handle when it's empty #1896

Merged
merged 4 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion peachjam/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,8 @@ def clean_content_html(self):
# prevent CKEditor-based editing of AKN HTML
if self.instance.content_html_is_akn:
return self.instance.content_html
return self.cleaned_data["content_html"]
# ensure html is clean
return self.instance.clean_content_html(self.cleaned_data["content_html"])


class AttachedFilesInline(BaseAttachmentFileInline):
Expand Down
12 changes: 11 additions & 1 deletion peachjam/analysis/citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
import requests
from django.conf import settings
from docpipe.matchers import ExtractedCitation
from lxml.etree import ParseError

from peachjam.models import CitationLink
from peachjam.xmlutils import parse_html_str

log = logging.getLogger(__name__)

Expand All @@ -28,7 +30,15 @@ def extract_citations(self, document):
return self.extract_citations_from_source_file(document)

def extract_citations_from_html(self, document):
html = lxml.html.fromstring(document.content_html)
try:
html = parse_html_str(document.content_html)
except ParseError as e:
log.warning(
f"Could not parse HTML for document {document.expression_uri()}: {document.content_html}",
exc_info=e,
)
return False

for matcher in self.matchers:
# the matcher can either manipulate the html, or return a new tree
res = matcher().markup_html_matches(document.expression_uri(), html)
Expand Down
53 changes: 53 additions & 0 deletions peachjam/migrations/0146_clean_content_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Generated by Django 3.2.25 on 2024-07-08 10:08
import re

from django.db import migrations
from django.db.models.functions import Length
from lxml.etree import ParserError

from peachjam.xmlutils import parse_html_str


def clean_content_html(content_html):
"""Ensure that content_html is not just whitespace HTML. Returns the cleaned value."""
if not content_html:
return None

# return None if the HTML doesn't have any content
try:
root = parse_html_str(content_html)
text = "".join(root.itertext()).strip()
text = re.sub(r"\s", "", text)
if not text:
return None
except (ValueError, ParserError):
return None

return content_html


def forwards(apps, schema_editor):
CoreDocument = apps.get_model("peachjam", "CoreDocument")
# get docs where the length of content_html is > 0 but less than 500
qs = CoreDocument.objects.annotate(text_length=Length("content_html")).filter(
content_html_is_akn=False,
content_html__isnull=False,
text_length__gt=0,
text_length__lt=1000,
)
for doc in qs.iterator(100):
html = doc.content_html
doc.content_html = clean_content_html(doc.content_html)
if doc.content_html != html:
doc.save(update_fields=["content_html"])


class Migration(migrations.Migration):

dependencies = [
("peachjam", "0145_alter_courtregistry_options"),
]

operations = [
migrations.RunPython(forwards, migrations.RunPython.noop),
]
22 changes: 20 additions & 2 deletions peachjam/models/core_document_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from docpipe.xmlutils import unwrap_element
from languages_plus.models import Language
from lxml import html
from lxml.etree import ParserError
from polymorphic.managers import PolymorphicManager
from polymorphic.models import PolymorphicModel
from polymorphic.query import PolymorphicQuerySet
Expand Down Expand Up @@ -479,6 +480,23 @@ def full_clean(self, *args, **kwargs):
self.pre_save()
super().full_clean(*args, **kwargs)

def clean_content_html(self, content_html):
"""Ensure that content_html is not just whitespace HTML. Returns the cleaned value."""
if not content_html:
return None

# return None if the HTML doesn't have any content
try:
root = parse_html_str(content_html)
text = "".join(root.itertext()).strip()
text = re.sub(r"\s", "", text)
if not text:
return None
except (ValueError, ParserError):
return None

return content_html

def clean(self):
super().clean()

Expand Down Expand Up @@ -590,7 +608,7 @@ def delete_citations(self):

if self.content_html and not self.content_html_is_akn:
# delete existing citations in html
root = html.fromstring(self.content_html)
root = parse_html_str(self.content_html)
deleted = False
for a in root.xpath('//a[starts-with(@href, "/akn/")]'):
unwrap_element(a)
Expand All @@ -612,7 +630,7 @@ def extract_content_from_source_file(self):
context = PipelineContext(word_pipeline)
context.source_file = self.source_file.file
word_pipeline(context)
self.content_html = context.html_text
self.content_html = self.clean_content_html(context.html_text)

for img in self.images.all():
img.delete()
Expand Down
5 changes: 3 additions & 2 deletions peachjam/models/journals_books.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ def delete_citations(self):
super().delete_citations()
# reset the HTML back to the original from markdown, because delete_citations()
# removes any embedded akn links
self.convert_content_markdown()
if self.content_markdown:
self.convert_content_markdown()

def convert_content_markdown(self):
self.content_html = markdownify(self.content_markdown)
self.content_html = markdownify(self.content_markdown or "")

def pre_save(self):
self.frbr_uri_doctype = "doc"
Expand Down
9 changes: 9 additions & 0 deletions peachjam/tests/test_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,12 @@ def test_journal(self):
journal.save()
self.assertEqual("doc", journal.frbr_uri_doctype)
self.assertEqual("journal", journal.frbr_uri_subtype)

def test_clean_content_html(self):
doc = CoreDocument()
self.assertIsNone(doc.clean_content_html(""""""))
self.assertIsNone(doc.clean_content_html("""<aoeu"""))
self.assertIsNone(doc.clean_content_html("""<div> \n&nbsp; \n</div>"""))
self.assertEqual(
doc.clean_content_html("""<div>test</div>"""), """<div>test</div>"""
)
Loading