feed headmatter into xml as well, add jack's script

harvard-lil · Jun 12, 2024 · 041007a · 041007a
1 parent 1d7be58
commit 041007a
Show file tree

Hide file tree

Showing 2 changed files with 160 additions and 35 deletions.
diff --git a/web/main/case_xml_converter.py b/web/main/case_xml_converter.py
@@ -0,0 +1,149 @@
+"""
+Convert between XML and HTML versions of CAP's formatted case data.
+"""
+
+from pprint import pprint
+
+import lxml.sax
+import lxml.html
+import xml.sax
+
+from lxml import etree
+
+# sax functions passed to render_sax_tags
+# These are the unbound ElementTreeContentHandler methods. Events are queued as
+# (method, args) pairs and render_sax_tags later invokes each one as
+# method(handler, *args) on a fresh handler instance.
+sax_start = lxml.sax.ElementTreeContentHandler.startElement
+sax_end = lxml.sax.ElementTreeContentHandler.endElement
+sax_chars = lxml.sax.ElementTreeContentHandler.characters
+
+# Source XML tag name -> HTML tag name emitted in its place.
+# XmlToHtmlHandler.endElement looks closing tags up here and falls back to the
+# original name for any tag that is not listed (e.g. "p", "blockquote").
+mapping = {
+    "casebody": "section",
+    "parties": "h4",
+    "docketnumber": "p",
+    "court": "p",
+    "decisiondate": "p",
+    "otherdate": "p",
+    "attorneys": "p",
+    "opinion": "article",
+    "author": "p",
+    "page-number": "a",
+    "extracted-citation": "a",
+    "bracketnum": "a",
+    "footnotemark": "a",
+}
+
+
+def render_sax_tags(tag_stack):
+    """
+    Replay a queue of recorded SAX events to build the output tree.
+
+    ``tag_stack`` is an iterable of ``(method, args)`` pairs, where ``method``
+    is one of the unbound handler functions (``sax_start``, ``sax_end``,
+    ``sax_chars``). Each is called in order against a single new
+    ``ElementTreeContentHandler``.
+
+    Returns the handler's ``_root`` (presumably the root element of the tree
+    that was built -- it is a private lxml attribute).
+    """
+    builder = lxml.sax.ElementTreeContentHandler()
+    for sax_method, call_args in tag_stack:
+        sax_method(builder, *call_args)
+    return builder._root
+
+
+class XmlToHtmlHandler(xml.sax.ContentHandler):
+    """
+    SAX content handler that records the HTML equivalent of a case XML document.
+
+    Nothing is built while parsing. Every callback appends a ``(method, args)``
+    pair to ``self.tag_stack``; pass that list to ``render_sax_tags`` to turn
+    the recorded events into a tree.
+
+    ``<casebody>`` becomes ``<section class="casebody">`` and immediately opens
+    a nested ``<section class="head-matter">``. That inner section is closed
+    when the first ``<opinion>`` starts, or when ``</casebody>`` is reached if
+    no opinion was seen.
+    """
+
+    def __init__(self, case_id):
+        """
+        :param case_id: value written to ``data-case-id`` on the casebody section.
+        """
+        super().__init__()
+        self.tag_stack = []
+        self.case_id = case_id
+        # True while the synthetic head-matter <section> is still open.
+        self.head_matter_open = False
+
+    def startElement(self, name, attrs):
+        """
+        Queue the opening HTML tag that corresponds to source element ``name``.
+
+        Raises ``KeyError`` if an attribute that the target tag requires
+        (e.g. ``firstpage``, ``label``, ``url``, ``id``) is missing from ``attrs``.
+        """
+        if name == "casebody":
+            self.tag_stack.append(
+                (
+                    sax_start,
+                    (
+                        "section",
+                        {
+                            "class": "casebody",
+                            "data-case-id": self.case_id,
+                            "data-firstpage": attrs["firstpage"],
+                            "data-lastpage": attrs["lastpage"],
+                        },
+                    ),
+                )
+            )
+            # Everything up to the first opinion is wrapped in a head-matter section.
+            self.tag_stack.append((sax_chars, ("\n  ",)))
+            self.tag_stack.append((sax_start, ("section", {"class": "head-matter"})))
+            self.head_matter_open = True
+        elif name == "opinion":
+            if self.head_matter_open:
+                self.close_head_matter()
+            self.tag_stack.append(
+                (sax_start, ("article", {"class": "opinion", "data-type": attrs["type"]}))
+            )
+        elif name == "page-number":
+            label = attrs["label"]
+            self.tag_stack.append(
+                (
+                    sax_start,
+                    (
+                        "a",
+                        {
+                            "id": "p" + label,
+                            "href": "#p" + label,
+                            "data-label": label,
+                            "data-citation-index": attrs["citation-index"],
+                            "class": "page-label",
+                        },
+                    ),
+                )
+            )
+        elif name == "extracted-citation":
+            new_attrs = {"href": attrs["url"], "class": "citation", "data-index": attrs["index"]}
+            if "case-ids" in attrs:
+                new_attrs["data-case-ids"] = attrs["case-ids"]
+            self.tag_stack.append((sax_start, ("a", new_attrs)))
+        elif name in ("footnotemark", "bracketnum"):
+            new_attrs = {"class": name}
+            if "href" in attrs:
+                new_attrs["href"] = attrs["href"]
+            if "id" in attrs:
+                new_attrs["id"] = attrs["id"]
+            self.tag_stack.append((sax_start, ("a", new_attrs)))
+        elif name in (
+            "parties",
+            "docketnumber",
+            "court",
+            "decisiondate",
+            "otherdate",
+            "attorneys",
+            "author",
+            "p",
+            "blockquote",
+        ):
+            # content element
+            # Build the output attributes in their own dict. Rebinding ``attrs``
+            # here would hide the source attributes from the "data-blocks" check
+            # below, so that attribute would never be carried over.
+            new_attrs = {"id": attrs["id"]}
+            if "data-blocks" in attrs:
+                new_attrs["data-blocks"] = attrs["data-blocks"]
+            if name not in ("p", "blockquote"):
+                new_attrs["class"] = name
+            # Same lookup endElement uses, so opening and closing tags always agree:
+            # "parties" -> "h4", "p"/"blockquote" unchanged, everything else -> "p".
+            new_name = mapping.get(name, name)
+            if self.head_matter_open:
+                # Extra indent because head-matter content sits one level deeper.
+                self.tag_stack.append((sax_chars, ("  ",)))
+            self.tag_stack.append((sax_start, (new_name, new_attrs)))
+        else:
+            # passthrough
+            self.tag_stack.append((sax_start, (name, attrs)))
+
+    def characters(self, text):
+        """Queue character data, widening 4-space runs to 6 spaces inside head matter."""
+        if self.head_matter_open and text == "    ":
+            text = "      "
+        self.tag_stack.append((sax_chars, (text,)))
+
+    def endElement(self, name):
+        """Queue the closing tag for ``name``, translated through ``mapping``."""
+        if name == "casebody" and self.head_matter_open:
+            # No opinion was encountered, so the head-matter section is still open.
+            self.close_head_matter()
+        self.tag_stack.append((sax_end, (mapping.get(name, name),)))
+
+    def close_head_matter(self):
+        """Queue the end of the head-matter section and clear the open flag."""
+        self.tag_stack.append((sax_end, ("section",)))
+        self.tag_stack.append((sax_chars, ("\n  ",)))
+        self.head_matter_open = False
+
+
+def xml_to_html(input, case_id):
+    """
+    Convert a case XML document to an HTML string.
+
+    ``input`` is parsed with ``xml.sax.parseString`` through an
+    ``XmlToHtmlHandler``; the events it records are replayed by
+    ``render_sax_tags`` and the result is serialised with
+    ``etree.tostring(..., encoding=str, method="html")``.
+
+    ``case_id`` is forwarded to the handler, which writes it to the
+    ``data-case-id`` attribute of the outer section.
+    """
+    converter = XmlToHtmlHandler(case_id)
+    xml.sax.parseString(input, converter)
+    html_tree = render_sax_tags(converter.tag_stack)
+    return etree.tostring(html_tree, encoding=str, method="html")
diff --git a/web/main/legal_document_sources.py b/web/main/legal_document_sources.py
@@ -13,12 +13,12 @@
 from django.conf import settings
 from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
 from pyquery import PyQuery
+from .case_xml_converter import xml_to_html
 
 from main.utils import (
     APICommunicationError,
     looks_like_case_law_link,
     looks_like_citation,
-    convert_case_xml_to_html,
 )
 
 vs_check = re.compile(" [vV][sS]?[.]? ")
@@ -641,14 +641,17 @@ def prepare_case_html(cluster, opinions_xml):
             "<casebody xmlns='http://nrs.harvard.edu/urn-3:HLS.Libr.US_Case_Law.Schema.Case_Body:v1' "
             "firstpage='0' lastpage='0'>"
         )
-        case_xml = f"{xml_declaration}\n{opinions_xml}</casebody>"
-        converted_case_html = convert_case_xml_to_html(case_xml)
-        formatted_headmatter_html = CourtListener.format_headmatter(cluster["headmatter"])
+        case_xml = f"{xml_declaration}\n{cluster['headmatter']}\n{opinions_xml}</casebody>"
+        # mismatched tag error workaround
+        case_xml = case_xml.replace("<br>", "")
 
-        if formatted_headmatter_html:
-            return formatted_headmatter_html + converted_case_html
-        else:
-            return converted_case_html
+        try:
+            converted_case_html = xml_to_html(case_xml, str(cluster["id"]))
+        except Exception as e:
+            msg = f"Error converting xml to html: {e}"
+            raise Exception(msg)
+
+        return converted_case_html
 
     @staticmethod
     def cl_params(search_params):
@@ -665,33 +668,6 @@ def cl_params(search_params):
         params = {**search_type_param, **search_params}
         return {k: params[k] for k in params.keys() if params[k] is not None}
 
-    @staticmethod
-    def format_headmatter(headmatter_str):
-        replacements = {
-            "\n": "",
-            "<parties": '<h4 class="parties"',
-            "</parties>": "</h4>",
-            "<docketnumber": '<p class="docketnumber"',
-            "<otherdate": '<p class="otherdate"',
-            "<decisiondate": '<p class="decisiondate"',
-            "<attorneys": '<p class="attorneys"',
-            "</docketnumber>": "</p>",
-            "</otherdate>": "</p>",
-            "</decisiondate>": "</p>",
-            "</attorneys>": "</p>",
-            "<br>": "",
-        }
-
-        try:
-            pattern = "|".join(replacements.keys())
-            cleaned_headmatter = re.sub(
-                pattern, lambda match: replacements[match.group(0)], headmatter_str
-            )
-        except TypeError:
-            cleaned_headmatter = None
-
-        return cleaned_headmatter
-
 
 class LegacyNoSearch:
     details = {