diff --git a/web/main/case_xml_converter.py b/web/main/case_xml_converter.py
new file mode 100644
index 000000000..f52b4ff32
--- /dev/null
+++ b/web/main/case_xml_converter.py
@@ -0,0 +1,151 @@
+"""
+Convert between XML and HTML versions of CAP's formatted case data.
+"""
+
+import lxml.sax
+import lxml.html
+import xml.sax
+
+from lxml import etree
+
+# Unbound lxml ElementTreeContentHandler methods. XmlToHtmlHandler queues them
+# as (method, args) tuples and render_sax_tags later invokes each one as
+# method(handler, *args) on a fresh handler instance.
+sax_start = lxml.sax.ElementTreeContentHandler.startElement
+sax_end = lxml.sax.ElementTreeContentHandler.endElement
+sax_chars = lxml.sax.ElementTreeContentHandler.characters
+
+# Source XML tag name -> HTML tag name. XmlToHtmlHandler.endElement looks up
+# the closing tag here; names that are absent (for example "p" and
+# "blockquote") are closed under their own name.
+mapping = {
+ "casebody": "section",
+ "parties": "h4",
+ "docketnumber": "p",
+ "court": "p",
+ "decisiondate": "p",
+ "otherdate": "p",
+ "attorneys": "p",
+ "opinion": "article",
+ "author": "p",
+ "page-number": "a",
+ "extracted-citation": "a",
+ "bracketnum": "a",
+ "footnotemark": "a",
+}
+
+
+def render_sax_tags(tag_stack):
+    """Replay queued SAX calls on a new lxml content handler.
+
+    ``tag_stack`` is an iterable of ``(method, args)`` pairs, such as
+    ``(sax_start, ("p", {...}))``. Each pair is executed as
+    ``method(handler, *args)`` in order, and the handler's ``_root`` is
+    returned once every call has run.
+    """
+    builder = lxml.sax.ElementTreeContentHandler()
+    for sax_method, call_args in tag_stack:
+        sax_method(builder, *call_args)
+    # NOTE(review): _root is a private attribute of the lxml handler; confirm
+    # it stays available across lxml upgrades.
+    return builder._root
+
+
+class XmlToHtmlHandler(xml.sax.ContentHandler):
+    """Record, as deferred lxml SAX calls, the HTML equivalent of a case's XML.
+
+    Every callback appends ``(method, args)`` tuples to ``tag_stack``; no tree
+    is built until the stack is replayed (see ``render_sax_tags``). Everything
+    between the start of ``casebody`` and the first ``opinion`` (or the end of
+    ``casebody`` when there is no opinion) is wrapped in a ``section`` element
+    with class ``head-matter``.
+    """
+
+    def __init__(self, case_id):
+        """Create a handler for one case.
+
+        ``case_id`` is emitted as the ``data-case-id`` attribute of the
+        outer ``section`` element.
+        """
+        super().__init__()
+        self.tag_stack = []
+        self.case_id = case_id
+        self.head_matter_open = False
+
+    def startElement(self, name, attrs):
+        """Queue the opening HTML tag that corresponds to source element ``name``.
+
+        Raises ``KeyError`` when a required source attribute is absent:
+        ``firstpage``/``lastpage`` on ``casebody``, ``label``/``citation-index``
+        on ``page-number``, ``url``/``index`` on ``extracted-citation``.
+        """
+        if name == "casebody":
+            self.tag_stack.append(
+                (
+                    sax_start,
+                    (
+                        "section",
+                        {
+                            "class": "casebody",
+                            "data-case-id": self.case_id,
+                            "data-firstpage": attrs["firstpage"],
+                            "data-lastpage": attrs["lastpage"],
+                        },
+                    ),
+                )
+            )
+            self.tag_stack.append((sax_chars, ("\n ",)))
+            # the head matter wrapper stays open until the first opinion starts
+            self.tag_stack.append((sax_start, ("section", {"class": "head-matter"})))
+            self.head_matter_open = True
+        elif name == "opinion":
+            if self.head_matter_open:
+                self.close_head_matter()
+            # set opinion type to 'none' for opinions that don't have 'type' in source xml
+            attr_type = attrs.get("type", "none")
+            self.tag_stack.append(
+                (sax_start, ("article", {"class": "opinion", "data-type": attr_type}))
+            )
+        elif name == "page-number":
+            label = attrs["label"]
+            self.tag_stack.append(
+                (
+                    sax_start,
+                    (
+                        "a",
+                        {
+                            "id": "p" + label,
+                            "href": f"#p{label}",
+                            "data-label": label,
+                            "data-citation-index": attrs["citation-index"],
+                            "class": "page-label",
+                        },
+                    ),
+                )
+            )
+        elif name == "extracted-citation":
+            new_attrs = {"href": attrs["url"], "class": "citation", "data-index": attrs["index"]}
+            if "case-ids" in attrs:
+                new_attrs["data-case-ids"] = attrs["case-ids"]
+            self.tag_stack.append((sax_start, ("a", new_attrs)))
+        elif name in ("footnotemark", "bracketnum"):
+            new_attrs = {"class": name}
+            if "href" in attrs:
+                new_attrs["href"] = attrs["href"]
+            if "id" in attrs:
+                new_attrs["id"] = attrs["id"]
+            self.tag_stack.append((sax_start, ("a", new_attrs)))
+        elif name in (
+            "parties",
+            "docketnumber",
+            "court",
+            "decisiondate",
+            "otherdate",
+            "attorneys",
+            "author",
+            "p",
+            "blockquote",
+        ):
+            # content element
+            # set id to 'none' for elements that don't have 'id' in source xml
+            new_attrs = {"id": attrs.get("id", "none")}
+            # Build the output in a separate dict: the membership test below
+            # must inspect the *source* attributes, otherwise data-blocks can
+            # never be found and is silently dropped.
+            if "data-blocks" in attrs:
+                new_attrs["data-blocks"] = attrs["data-blocks"]
+            if name not in ("p", "blockquote"):
+                new_attrs["class"] = name
+            new_name = "h4" if name == "parties" else "blockquote" if name == "blockquote" else "p"
+            if self.head_matter_open:
+                self.tag_stack.append((sax_chars, (" ",)))
+            self.tag_stack.append((sax_start, (new_name, new_attrs)))
+        else:
+            # passthrough: unknown elements keep their name and attributes
+            self.tag_stack.append((sax_start, (name, attrs)))
+
+    def characters(self, text):
+        """Queue a text node, adjusting whitespace-only text inside head matter."""
+        # NOTE(review): both literals below are a single space in this copy of
+        # the patch, which makes the branch a no-op; whitespace may have been
+        # collapsed in transit -- confirm against the upstream file.
+        if self.head_matter_open and text == " ":
+            text = " "
+        self.tag_stack.append((sax_chars, (text,)))
+
+    def endElement(self, name):
+        """Queue the closing tag for ``name``, translated through ``mapping``."""
+        # a casebody without any opinion still has its head matter wrapper open
+        if name == "casebody" and self.head_matter_open:
+            self.close_head_matter()
+        self.tag_stack.append((sax_end, (mapping.get(name, name),)))
+
+    def close_head_matter(self):
+        """Queue the end of the head matter ``section`` and mark it closed."""
+        self.tag_stack.append((sax_end, ("section",)))
+        self.tag_stack.append((sax_chars, ("\n ",)))
+        self.head_matter_open = False
+
+
+def xml_to_html(input, case_id):
+    """Convert a case's XML into an HTML string.
+
+    ``input`` is parsed with ``xml.sax.parseString`` through an
+    ``XmlToHtmlHandler`` created for ``case_id``; the calls it records are
+    replayed by ``render_sax_tags`` and the resulting tree is serialized with
+    lxml's ``html`` method.
+    """
+    recorder = XmlToHtmlHandler(case_id)
+    xml.sax.parseString(input, recorder)
+    html_root = render_sax_tags(recorder.tag_stack)
+    return etree.tostring(html_root, encoding=str, method="html")
diff --git a/web/main/legal_document_sources.py b/web/main/legal_document_sources.py
index 92f573018..a178b9827 100644
--- a/web/main/legal_document_sources.py
+++ b/web/main/legal_document_sources.py
@@ -13,8 +13,13 @@
from django.conf import settings
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from pyquery import PyQuery
+from main.case_xml_converter import xml_to_html
-from main.utils import APICommunicationError, looks_like_case_law_link, looks_like_citation
+from main.utils import (
+ APICommunicationError,
+ looks_like_case_law_link,
+ looks_like_citation,
+)
vs_check = re.compile(" [vV][sS]?[.]? ")
@@ -519,8 +524,8 @@ def header_template(legal_document):
class CourtListener:
details = {
"name": "CourtListener",
- "short_description": "hello",
- "long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions",
+ "short_description": "CourtListener contains millions of legal opinions.",
+ "long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions.",
"link": settings.COURTLISTENER_BASE_URL,
"search_regexes": [],
"footnote_regexes": [],
@@ -532,11 +537,7 @@ def search(search_params):
if not settings.COURTLISTENER_API_KEY:
raise APICommunicationError("A CourtListener API key is required")
try:
- params = (
- {"citation": search_params.q}
- if looks_like_citation(search_params.q)
- else {"q": search_params.q}
- )
+ params = CourtListener.get_search_params(search_params)
resp = requests.get(
f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search",
params,
@@ -552,13 +553,16 @@ def search(search_params):
results.append(
{
"fullName": r["caseName"],
- "shortName": r["caseName"],
- "fullCitations": ", ".join(r["citation"]),
- "shortCitations": ", ".join(r["citation"][:3])
- + ("..." if len(r["citation"]) > 3 else ""),
- "effectiveDate": parser.isoparse(r["dateFiled"]).strftime("%Y-%m-%d"),
+ "shortName": truncate_name(r["caseName"]),
+ "fullCitations": ", ".join(r["citation"]) if r["citation"] else "",
+ "shortCitations": (
+ ", ".join(r["citation"][:3]) + ("..." if len(r["citation"]) > 3 else "")
+ if r["citation"]
+ else ""
+ ),
+ "effectiveDate": parser.isoparse(r["dateFiled"][:25]).strftime("%Y-%m-%d"),
"url": f"{settings.COURTLISTENER_BASE_URL}{r['absolute_url']}",
- "id": r["id"],
+ "id": r["cluster_id"],
}
)
return results
@@ -576,38 +580,132 @@ def pull(legal_doc_source, id):
)
resp.raise_for_status()
cluster = resp.json()
- resp = requests.get(
- f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{id}/",
- headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"},
- )
- resp.raise_for_status()
-
- opinion = resp.json()
+ cluster["html_info"] = {"source": "court listener"}
+ cluster["sub_opinions"].sort(key=lambda url: int(url.split("/")[-2]))
+
+ sub_opinion_jsons = []
+ for opinion in cluster["sub_opinions"]:
+ sub_opinion_jsons.append(CourtListener.get_opinion_body(opinion))
+
+ text_source = ""
+ for content_type in (
+ "xml_harvard",
+ "html_with_citations",
+ "html_columbia",
+ "html_lawbox",
+ "html_anon_2020",
+ "html",
+ "plain_text",
+ ):
+ case_text = "".join(sub_opinion[content_type] for sub_opinion in sub_opinion_jsons)
+ if case_text:
+ case_text = case_text.replace('', "")
+ text_source = content_type
+ break
+
+ if not case_text:
+ msg = f"Case text not found for cluster {id}"
+ raise Exception(msg)
+
+ if text_source == "xml_harvard":
+ case_text = CourtListener.prepare_case_html(cluster, case_text)
+
+ cluster["html_info"]["source_field"] = text_source
+ additional_metadata = (CourtListener.get_additional_cluster_metadata(id))["results"][0]
except requests.exceptions.HTTPError as e:
msg = f"Failed call to {resp.request.url}: {e}\n{resp.content}"
raise APICommunicationError(msg)
- body = opinion["html"]
+ citations = [
+ f"{x.get('volume')} {x.get('reporter')} {x.get('page')}" for x in cluster["citations"]
+ ]
+
+ # https://www.courtlistener.com/help/api/rest/#case-names
+ case_name = cluster["case_name"] or cluster["case_name_full"][:10000]
+ cluster["court"] = {"name": additional_metadata.get("court")}
+ cluster["docket_number"] = additional_metadata.get("docketNumber")
+
case = LegalDocument(
source=legal_doc_source,
- short_name=cluster["case_name"],
- name=cluster["case_name"],
+ short_name=cluster.get("case_name"),
+ name=case_name,
doc_class="Case",
- citations=cluster["citations"],
- jurisdiction="",
- effective_date=cluster["date_filed"],
- publication_date=cluster["date_filed"],
+ citations=citations,
+ jurisdiction=cluster.get("court_id"),
+ effective_date=parser.parse(cluster.get("date_filed")),
+ publication_date=parser.parse(cluster.get("date_modified")),
updated_date=datetime.now(),
source_ref=str(id),
- content=body,
- metadata=None,
+ content=case_text,
+ metadata=cluster,
)
+
return case
@staticmethod
def header_template(legal_document):
- return "empty_header.html"
+ return "court_listener_header.html"
+
+    @staticmethod
+    def get_opinion_body(sub_opinion_url):
+        """Fetch one opinion from the v3 opinions endpoint and return its parsed JSON.
+
+        The opinion number is the second-to-last "/"-separated segment of
+        ``sub_opinion_url``. ``raise_for_status`` is called on the response,
+        so an error status propagates as a requests ``HTTPError``.
+        """
+        opinion_id = int(sub_opinion_url.split("/")[-2])
+        endpoint = f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{opinion_id}/"
+        auth_headers = {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}
+        response = requests.get(endpoint, headers=auth_headers)
+        response.raise_for_status()
+        return response.json()
+
+ @staticmethod
+ def prepare_case_html(cluster, opinions_xml):
+ """
+ Convert a cluster's opinion XML to HTML via xml_to_html.
+
+ cluster must provide an "id" key; it is passed (as a string) to xml_to_html
+ and used in the error message. Any exception raised during conversion is
+ re-raised as a plain Exception whose message names the cluster id.
+ """
+ # NOTE(review): the statement below, which should produce case_xml from
+ # opinions_xml, appears truncated in this copy of the patch -- restore it
+ # from the upstream commit before relying on this hunk.
+ xml_declaration = (
+ "\n
", "").replace('https:=""', "")
+
+ try:
+ converted_case_html = xml_to_html(case_xml, str(cluster["id"]))
+ except Exception as e:
+ # NOTE(review): the original exception is only kept as text in msg;
+ # consider "raise ... from e" to preserve the traceback.
+ msg = f"Error converting xml to html for case {cluster['id']}: {e}"
+ raise Exception(msg)
+
+ return converted_case_html
+
+    @staticmethod
+    def get_search_params(search_params):
+        """Build the query-string parameters for a search request.
+
+        The query text is sent as ``citation`` when ``looks_like_citation``
+        accepts it and as ``q`` otherwise. ``after_date``, ``before_date`` and
+        ``jurisdiction`` are sent as ``filed_after``, ``filed_before`` and
+        ``court``. Entries whose value is ``None`` are left out.
+        """
+        query_text = search_params.q
+        if looks_like_citation(query_text):
+            candidates = {"citation": query_text}
+        else:
+            candidates = {"q": query_text}
+        candidates.update(
+            filed_after=search_params.after_date,
+            filed_before=search_params.before_date,
+            court=search_params.jurisdiction,
+        )
+        return {key: value for key, value in candidates.items() if value is not None}
+
+    @staticmethod
+    def get_additional_cluster_metadata(cluster_id):
+        """
+        Look a cluster up through the search endpoint and return the parsed JSON response.
+
+        Some cluster metadata, such as court and docket number, is available from
+        the search endpoint instead of the clusters endpoint. An error status
+        propagates as a requests HTTPError via raise_for_status.
+        """
+        query = {"q": f"cluster_id:{cluster_id}"}
+        search_url = f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search"
+        token_header = {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}
+        response = requests.get(search_url, query, headers=token_header)
+        response.raise_for_status()
+        return response.json()
class LegacyNoSearch:
diff --git a/web/main/templates/export/as_printable_html/node.html b/web/main/templates/export/as_printable_html/node.html
index 2b2295596..e433cb750 100644
--- a/web/main/templates/export/as_printable_html/node.html
+++ b/web/main/templates/export/as_printable_html/node.html
@@ -57,7 +57,11 @@