Skip to content

Commit

Permalink
feed headmatter into xml as well, add jack's script
Browse files Browse the repository at this point in the history
  • Loading branch information
teovin committed Jun 12, 2024
1 parent 1d7be58 commit 041007a
Show file tree
Hide file tree
Showing 2 changed files with 160 additions and 35 deletions.
149 changes: 149 additions & 0 deletions web/main/case_xml_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""
Convert between XML and HTML versions of CAP's formatted case data.
"""

from pprint import pprint

import lxml.sax
import lxml.html
import xml.sax

from lxml import etree

# Unbound SAX-builder methods of lxml's ElementTreeContentHandler.
# They are recorded together with their arguments as (method, args) pairs and
# later invoked by render_sax_tags as method(handler, *args).
sax_start = lxml.sax.ElementTreeContentHandler.startElement
sax_end = lxml.sax.ElementTreeContentHandler.endElement
sax_chars = lxml.sax.ElementTreeContentHandler.characters

# XML element name -> HTML element name. Names that are absent from this
# table are looked up with mapping.get(name, name), i.e. kept unchanged.
mapping = {
    # structural containers
    "casebody": "section",
    "opinion": "article",
    # heading
    "parties": "h4",
    # paragraph-like metadata
    **dict.fromkeys(
        ("docketnumber", "court", "decisiondate", "otherdate", "attorneys", "author"),
        "p",
    ),
    # inline anchors
    **dict.fromkeys(
        ("page-number", "extracted-citation", "bracketnum", "footnotemark"),
        "a",
    ),
}


def render_sax_tags(tag_stack):
    """Replay recorded SAX calls and return the root of the resulting tree.

    Args:
        tag_stack: iterable of ``(method, args)`` pairs, where ``method`` is
            an unbound handler method (such as ``sax_start``) and ``args`` is
            the tuple of positional arguments to pass after the handler.

    Returns:
        The ``_root`` attribute of the ``lxml.sax.ElementTreeContentHandler``
        that the calls were replayed against.
    """
    builder = lxml.sax.ElementTreeContentHandler()
    for sax_call, call_args in tag_stack:
        # e.g. sax_start(builder, "p", {...}) -- this is what builds the tree
        sax_call(builder, *call_args)
    return builder._root


class XmlToHtmlHandler(xml.sax.ContentHandler):
    """SAX handler that records the HTML equivalent of a case-body XML document.

    No tree is built here. Every callback appends a ``(method, args)`` pair to
    ``self.tag_stack``, where ``method`` is one of the module-level
    ``sax_start`` / ``sax_end`` / ``sax_chars`` functions; ``render_sax_tags``
    replays that list to produce the actual tree.

    Everything between the opening ``<casebody>`` and the first ``<opinion>``
    (or the closing ``</casebody>`` when there is no opinion) is wrapped in a
    ``<section class="head-matter">`` element.

    Attributes:
        tag_stack: recorded ``(method, args)`` pairs, in document order.
        case_id: value emitted as ``data-case-id`` on the outer section.
        head_matter_open: True while the head-matter section is still open.
    """

    # Block-level elements that are rewritten with a filtered attribute set
    # (id, optional data-blocks, and a class naming the source element).
    CONTENT_TAGS = (
        "parties",
        "docketnumber",
        "court",
        "decisiondate",
        "otherdate",
        "attorneys",
        "author",
        "p",
        "blockquote",
    )

    def __init__(self, case_id):
        """Create a handler for one document; ``case_id`` is copied to the output."""
        super().__init__()
        self.tag_stack = []
        self.case_id = case_id
        self.head_matter_open = False

    def startElement(self, name, attrs):
        """Record the HTML start tag corresponding to the XML element ``name``.

        Args:
            name: XML element name reported by the SAX parser.
            attrs: attribute mapping reported by the SAX parser.

        Raises:
            KeyError: if an attribute that is read unconditionally is missing
                (``firstpage``/``lastpage`` on casebody, ``type`` on opinion,
                ``label``/``citation-index`` on page-number, ``url``/``index``
                on extracted-citation, ``id`` on content elements).
        """
        if name == "casebody":
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "section",
                        {
                            "class": "casebody",
                            "data-case-id": self.case_id,
                            "data-firstpage": attrs["firstpage"],
                            "data-lastpage": attrs["lastpage"],
                        },
                    ),
                )
            )
            self.tag_stack.append((sax_chars, ("\n ",)))
            # The head matter starts immediately and stays open until the first
            # opinion (or the end of the casebody) closes it.
            self.tag_stack.append((sax_start, ("section", {"class": "head-matter"})))
            self.head_matter_open = True
        elif name == "opinion":
            if self.head_matter_open:
                self.close_head_matter()
            self.tag_stack.append(
                (sax_start, ("article", {"class": "opinion", "data-type": attrs["type"]}))
            )
        elif name == "page-number":
            label = attrs["label"]
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "a",
                        {
                            "id": "p" + label,
                            "href": "#p" + label,
                            "data-label": label,
                            "data-citation-index": attrs["citation-index"],
                            "class": "page-label",
                        },
                    ),
                )
            )
        elif name == "extracted-citation":
            new_attrs = {"href": attrs["url"], "class": "citation", "data-index": attrs["index"]}
            if "case-ids" in attrs:
                new_attrs["data-case-ids"] = attrs["case-ids"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in ("footnotemark", "bracketnum"):
            new_attrs = {"class": name}
            if "href" in attrs:
                new_attrs["href"] = attrs["href"]
            if "id" in attrs:
                new_attrs["id"] = attrs["id"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in self.CONTENT_TAGS:
            # Build the output attributes in a separate dict so the source
            # attributes remain available for the data-blocks lookup below.
            new_attrs = {"id": attrs["id"]}
            if "data-blocks" in attrs:
                new_attrs["data-blocks"] = attrs["data-blocks"]
            if name not in ("p", "blockquote"):
                new_attrs["class"] = name
            # Use the same table as endElement so start and end tags always agree.
            new_name = mapping.get(name, name)
            if self.head_matter_open:
                self.tag_stack.append((sax_chars, (" ",)))
            self.tag_stack.append((sax_start, (new_name, new_attrs)))
        else:
            # passthrough: unknown elements keep their name and attributes
            self.tag_stack.append((sax_start, (name, attrs)))

    def characters(self, text):
        """Record a run of character data."""
        # NOTE(review): as written this replaces a single space with a single
        # space; presumably the two literals originally differed in width to
        # re-indent head-matter content -- confirm against the upstream source.
        if self.head_matter_open and text == " ":
            text = " "
        self.tag_stack.append((sax_chars, (text,)))

    def endElement(self, name):
        """Record the HTML end tag corresponding to the XML element ``name``."""
        if name == "casebody" and self.head_matter_open:
            # No opinion was seen, so the head-matter section is still open.
            self.close_head_matter()
        self.tag_stack.append((sax_end, (mapping.get(name, name),)))

    def close_head_matter(self):
        """Record the end of the head-matter section and mark it closed."""
        self.tag_stack.append((sax_end, ("section",)))
        self.tag_stack.append((sax_chars, ("\n ",)))
        self.head_matter_open = False


def xml_to_html(input, case_id):
    """Convert a case-body XML document to an HTML string.

    Args:
        input: the XML document, handed to ``xml.sax.parseString``.
        case_id: identifier passed to ``XmlToHtmlHandler``.

    Returns:
        The tree built from the handler's recorded calls, serialized by
        ``etree.tostring`` with ``encoding=str`` and ``method="html"``.
    """
    converter = XmlToHtmlHandler(case_id)
    xml.sax.parseString(input, converter)
    html_root = render_sax_tags(converter.tag_stack)
    return etree.tostring(html_root, encoding=str, method="html")
46 changes: 11 additions & 35 deletions web/main/legal_document_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
from django.conf import settings
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from pyquery import PyQuery
from .case_xml_converter import xml_to_html

from main.utils import (
APICommunicationError,
looks_like_case_law_link,
looks_like_citation,
convert_case_xml_to_html,
)

# Matches a space-delimited "v", "vs", "v." or "vs." (either letter in any case).
vs_check = re.compile(" [vV][sS]?[.]? ")
Expand Down Expand Up @@ -641,14 +641,17 @@ def prepare_case_html(cluster, opinions_xml):
"<casebody xmlns='http://nrs.harvard.edu/urn-3:HLS.Libr.US_Case_Law.Schema.Case_Body:v1' "
"firstpage='0' lastpage='0'>"
)
case_xml = f"{xml_declaration}\n{opinions_xml}</casebody>"
converted_case_html = convert_case_xml_to_html(case_xml)
formatted_headmatter_html = CourtListener.format_headmatter(cluster["headmatter"])
case_xml = f"{xml_declaration}\n{cluster['headmatter']}\n{opinions_xml}</casebody>"
# mismatched tag error workaround
case_xml = case_xml.replace("<br>", "")

if formatted_headmatter_html:
return formatted_headmatter_html + converted_case_html
else:
return converted_case_html
try:
converted_case_html = xml_to_html(case_xml, str(cluster["id"]))
except Exception as e:
msg = f"Error converting xml to html: {e}"
raise Exception(msg)

return converted_case_html

@staticmethod
def cl_params(search_params):
Expand All @@ -665,33 +668,6 @@ def cl_params(search_params):
params = {**search_type_param, **search_params}
return {k: params[k] for k in params.keys() if params[k] is not None}

@staticmethod
def format_headmatter(headmatter_str):
replacements = {
"\n": "",
"<parties": '<h4 class="parties"',
"</parties>": "</h4>",
"<docketnumber": '<p class="docketnumber"',
"<otherdate": '<p class="otherdate"',
"<decisiondate": '<p class="decisiondate"',
"<attorneys": '<p class="attorneys"',
"</docketnumber>": "</p>",
"</otherdate>": "</p>",
"</decisiondate>": "</p>",
"</attorneys>": "</p>",
"<br>": "",
}

try:
pattern = "|".join(replacements.keys())
cleaned_headmatter = re.sub(
pattern, lambda match: replacements[match.group(0)], headmatter_str
)
except TypeError:
cleaned_headmatter = None

return cleaned_headmatter


class LegacyNoSearch:
details = {
Expand Down

0 comments on commit 041007a

Please sign in to comment.