"""
Convert between XML and HTML versions of CAP's formatted case data.
"""

import lxml.sax
import lxml.html
import xml.sax

from lxml import etree

# sax functions passed to render_sax_tags
# (unbound methods: render_sax_tags supplies the handler instance as the first argument)
sax_start = lxml.sax.ElementTreeContentHandler.startElement
sax_end = lxml.sax.ElementTreeContentHandler.endElement
sax_chars = lxml.sax.ElementTreeContentHandler.characters

# source XML tag name -> HTML tag name; names not listed here keep their own name
mapping = {
    "casebody": "section",
    "parties": "h4",
    "docketnumber": "p",
    "court": "p",
    "decisiondate": "p",
    "otherdate": "p",
    "attorneys": "p",
    "opinion": "article",
    "author": "p",
    "page-number": "a",
    "extracted-citation": "a",
    "bracketnum": "a",
    "footnotemark": "a",
}


def render_sax_tags(tag_stack):
    """
    Replay a list of recorded sax commands and return the resulting root element.

    tag_stack is an iterable of (method, args) pairs, where method is one of
    sax_start / sax_end / sax_chars and args is the tuple of arguments to pass
    after the handler.
    """
    # run all of our commands, like "sax_start(*args)", to actually build the xml tree
    handler = lxml.sax.ElementTreeContentHandler()
    for method, args in tag_stack:
        method(handler, *args)
    return handler._root


class XmlToHtmlHandler(xml.sax.ContentHandler):
    """
    SAX handler that records the HTML equivalent of a case XML document.

    Nothing is built while parsing: every event appends a (method, args) pair
    to self.tag_stack, which render_sax_tags() later replays to build the tree.
    Everything between the opening of <casebody> and the first <opinion> (or
    the end of <casebody>) is wrapped in a <section class="head-matter">.
    """

    def __init__(self, case_id):
        # recorded (sax method, args) commands, in document order
        self.tag_stack = []
        # emitted as the data-case-id attribute of the outer <section>
        self.case_id = case_id
        # True while the head-matter <section> has been opened but not yet closed
        self.head_matter_open = False

    def startElement(self, name, attrs):
        """Record the opening HTML tag(s) for the source element `name`."""
        if name == "casebody":
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "section",
                        {
                            "class": "casebody",
                            "data-case-id": self.case_id,
                            "data-firstpage": attrs["firstpage"],
                            "data-lastpage": attrs["lastpage"],
                        },
                    ),
                )
            )
            self.tag_stack.append((sax_chars, ("\n ",)))
            self.tag_stack.append((sax_start, ("section", {"class": "head-matter"})))
            self.head_matter_open = True
        elif name == "opinion":
            # the first opinion marks the end of the head matter
            if self.head_matter_open:
                self.close_head_matter()
            # set opinion type to 'none' for opinions that don't have 'type' in source xml
            attr_type = attrs.get("type", "none")
            self.tag_stack.append(
                (sax_start, ("article", {"class": "opinion", "data-type": attr_type}))
            )
        elif name == "page-number":
            label = attrs["label"]
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "a",
                        {
                            "id": "p" + label,
                            "href": f"#p{label}",
                            "data-label": label,
                            "data-citation-index": attrs["citation-index"],
                            "class": "page-label",
                        },
                    ),
                )
            )
        elif name == "extracted-citation":
            new_attrs = {"href": attrs["url"], "class": "citation", "data-index": attrs["index"]}
            if "case-ids" in attrs:
                new_attrs["data-case-ids"] = attrs["case-ids"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in ("footnotemark", "bracketnum"):
            new_attrs = {"class": name}
            if "href" in attrs:
                new_attrs["href"] = attrs["href"]
            if "id" in attrs:
                new_attrs["id"] = attrs["id"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in (
            "parties",
            "docketnumber",
            "court",
            "decisiondate",
            "otherdate",
            "attorneys",
            "author",
            "p",
            "blockquote",
        ):
            # content element
            # set id to 'none' for elements that don't have 'id' in source xml
            # Build a separate dict instead of rebinding `attrs`: the source
            # attributes must still be readable for the data-blocks lookup.
            new_attrs = {"id": attrs.get("id", "none")}
            if "data-blocks" in attrs:
                new_attrs["data-blocks"] = attrs["data-blocks"]
            if name not in ("p", "blockquote"):
                new_attrs["class"] = name
            # same table endElement uses, so opening and closing tags always agree
            # ("p" and "blockquote" are not in mapping and keep their own name)
            new_name = mapping.get(name, name)
            if self.head_matter_open:
                self.tag_stack.append((sax_chars, (" ",)))
            self.tag_stack.append((sax_start, (new_name, new_attrs)))
        else:
            # passthrough
            self.tag_stack.append((sax_start, (name, attrs)))

    def characters(self, text):
        """Record a run of character data."""
        # NOTE(review): as written this replaces " " with " ", which is a no-op;
        # the two literals presumably differed in whitespace width - confirm
        # against the committed file.
        if self.head_matter_open and text == " ":
            text = " "
        self.tag_stack.append((sax_chars, (text,)))

    def endElement(self, name):
        """Record the closing HTML tag for the source element `name`."""
        # a casebody without any opinion still has its head-matter section open
        if name == "casebody" and self.head_matter_open:
            self.close_head_matter()
        self.tag_stack.append((sax_end, (mapping.get(name, name),)))

    def close_head_matter(self):
        """Record the end of the head-matter <section> and clear the open flag."""
        self.tag_stack.append((sax_end, ("section",)))
        self.tag_stack.append((sax_chars, ("\n ",)))
        self.head_matter_open = False


def xml_to_html(input, case_id):
    """
    Convert a case XML document to an HTML string.

    input is parsed with xml.sax.parseString; case_id is written to the
    data-case-id attribute of the outer <section class="casebody">.
    Returns the serialized tree as a str (lxml "html" output method).
    """
    handler = XmlToHtmlHandler(case_id)
    xml.sax.parseString(input, handler)
    tree = render_sax_tags(handler.tag_stack)
    return etree.tostring(tree, encoding=str, method="html")
"shortCitations": ", ".join(r["citation"][:3]) - + ("..." if len(r["citation"]) > 3 else ""), - "effectiveDate": parser.isoparse(r["dateFiled"]).strftime("%Y-%m-%d"), + "shortName": truncate_name(r["caseName"]), + "fullCitations": ", ".join(r["citation"]) if r["citation"] else "", + "shortCitations": ( + ", ".join(r["citation"][:3]) + ("..." if len(r["citation"]) > 3 else "") + if r["citation"] + else "" + ), + "effectiveDate": parser.isoparse(r["dateFiled"][:25]).strftime("%Y-%m-%d"), "url": f"{settings.COURTLISTENER_BASE_URL}{r['absolute_url']}", - "id": r["id"], + "id": r["cluster_id"], } ) return results @@ -576,38 +580,132 @@ def pull(legal_doc_source, id): ) resp.raise_for_status() cluster = resp.json() - resp = requests.get( - f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{id}/", - headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}, - ) - resp.raise_for_status() - - opinion = resp.json() + cluster["html_info"] = {"source": "court listener"} + cluster["sub_opinions"].sort(key=lambda url: int(url.split("/")[-2])) + + sub_opinion_jsons = [] + for opinion in cluster["sub_opinions"]: + sub_opinion_jsons.append(CourtListener.get_opinion_body(opinion)) + + text_source = "" + for content_type in ( + "xml_harvard", + "html_with_citations", + "html_columbia", + "html_lawbox", + "html_anon_2020", + "html", + "plain_text", + ): + case_text = "".join(sub_opinion[content_type] for sub_opinion in sub_opinion_jsons) + if case_text: + case_text = case_text.replace('', "") + text_source = content_type + break + + if not case_text: + msg = f"Case text not found for cluster {id}" + raise Exception(msg) + + if text_source == "xml_harvard": + case_text = CourtListener.prepare_case_html(cluster, case_text) + + cluster["html_info"]["source_field"] = text_source + additional_metadata = (CourtListener.get_additional_cluster_metadata(id))["results"][0] except requests.exceptions.HTTPError as e: msg = f"Failed call to {resp.request.url}: 
{e}\n{resp.content}" raise APICommunicationError(msg) - body = opinion["html"] + citations = [ + f"{x.get('volume')} {x.get('reporter')} {x.get('page')}" for x in cluster["citations"] + ] + + # https://www.courtlistener.com/help/api/rest/#case-names + case_name = cluster["case_name"] or cluster["case_name_full"][:10000] + cluster["court"] = {"name": additional_metadata.get("court")} + cluster["docket_number"] = additional_metadata.get("docketNumber") + case = LegalDocument( source=legal_doc_source, - short_name=cluster["case_name"], - name=cluster["case_name"], + short_name=cluster.get("case_name"), + name=case_name, doc_class="Case", - citations=cluster["citations"], - jurisdiction="", - effective_date=cluster["date_filed"], - publication_date=cluster["date_filed"], + citations=citations, + jurisdiction=cluster.get("court_id"), + effective_date=parser.parse(cluster.get("date_filed")), + publication_date=parser.parse(cluster.get("date_modified")), updated_date=datetime.now(), source_ref=str(id), - content=body, - metadata=None, + content=case_text, + metadata=cluster, ) + return case @staticmethod def header_template(legal_document): - return "empty_header.html" + return "court_listener_header.html" + + @staticmethod + def get_opinion_body(sub_opinion_url): + opinion_num = int(sub_opinion_url.split("/")[-2]) + resp = requests.get( + f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{opinion_num}/", + headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}, + ) + + resp.raise_for_status() + return resp.json() + + @staticmethod + def prepare_case_html(cluster, opinions_xml): + xml_declaration = ( + "\n" + ) + case_xml = f"{xml_declaration}\n{cluster['headmatter']}\n{opinions_xml}" + # 'mismatched br tag' and 'invalid attribute https:' error workarounds + case_xml = case_xml.replace("
", "").replace('https:=""', "") + + try: + converted_case_html = xml_to_html(case_xml, str(cluster["id"])) + except Exception as e: + msg = f"Error converting xml to html for case {cluster['id']}: {e}" + raise Exception(msg) + + return converted_case_html + + @staticmethod + def get_search_params(search_params): + search_type_param = ( + {"citation": search_params.q} + if looks_like_citation(search_params.q) + else {"q": search_params.q} + ) + search_params = { + "filed_after": search_params.after_date, + "filed_before": search_params.before_date, + "court": search_params.jurisdiction, + } + params = {**search_type_param, **search_params} + return {k: params[k] for k in params.keys() if params[k] is not None} + + @staticmethod + def get_additional_cluster_metadata(cluster_id): + """ + Additional metadata about a cluster such as court and docket number are available in search endpoint + Instead of clusters endpoint + """ + params = {"q": f"cluster_id:{cluster_id}"} + + resp = requests.get( + f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search", + params, + headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}, + ) + + resp.raise_for_status() + return resp.json() class LegacyNoSearch: diff --git a/web/main/templates/export/as_printable_html/node.html b/web/main/templates/export/as_printable_html/node.html index 2b2295596..e433cb750 100644 --- a/web/main/templates/export/as_printable_html/node.html +++ b/web/main/templates/export/as_printable_html/node.html @@ -57,7 +57,11 @@

{{ node.subtitle }}

{% if node.resource_type.lower == 'legaldocument' %} {% if node.resource.doc_class.lower == 'case' %} - {% include "includes/legal_doc_sources/cap_header.html" with legal_doc=node.resource %} + {% if node.resource.metadata.html_info.source == 'cap' %} + {% include "includes/legal_doc_sources/cap_header.html" with legal_doc=node.resource %} + {% elif node.resource.metadata.html_info.source == 'court listener' %} + {% include "includes/legal_doc_sources/court_listener_header.html" with legal_doc=node.resource %} + {% endif %} {% elif node.resource.doc_class.lower == 'code' %} {% include "includes/legal_doc_sources/gpo_header.html" with legal_doc=node.resource %} {% endif %} diff --git a/web/main/templates/includes/legal_doc_sources/cap_header.html b/web/main/templates/includes/legal_doc_sources/cap_header.html index 34d35713f..ee1e908e4 100644 --- a/web/main/templates/includes/legal_doc_sources/cap_header.html +++ b/web/main/templates/includes/legal_doc_sources/cap_header.html @@ -1,8 +1,7 @@