Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add first draft of CL search #2051

Merged
merged 20 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions web/main/case_xml_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""
Convert the XML version of CAP's formatted case data to HTML.
"""

import lxml.sax
import lxml.html
import xml.sax

from lxml import etree

# Unbound lxml.sax handler methods. They are stored in tag_stack entries and
# later called by render_sax_tags with a handler instance as first argument.
sax_start, sax_end, sax_chars = (
    lxml.sax.ElementTreeContentHandler.startElement,
    lxml.sax.ElementTreeContentHandler.endElement,
    lxml.sax.ElementTreeContentHandler.characters,
)

# Source XML element name -> HTML tag emitted in its place.
# Names that are absent from this table keep their own tag name.
mapping = {
    "casebody": "section",
    "parties": "h4",
    **dict.fromkeys(
        ("docketnumber", "court", "decisiondate", "otherdate", "attorneys"), "p"
    ),
    "opinion": "article",
    "author": "p",
    **dict.fromkeys(
        ("page-number", "extracted-citation", "bracketnum", "footnotemark"), "a"
    ),
}


def render_sax_tags(tag_stack):
    """Replay recorded SAX calls and return the resulting root.

    tag_stack is an iterable of ``(method, args)`` pairs, e.g.
    ``(sax_start, ("p", {...}))``. Each method is invoked as
    ``method(handler, *args)`` on one fresh ElementTreeContentHandler, in
    order, which is what actually builds the tree.
    """
    builder = lxml.sax.ElementTreeContentHandler()
    for sax_method, call_args in tag_stack:
        sax_method(builder, *call_args)
    # The handler's private ``_root`` attribute is returned unchanged.
    return builder._root


class XmlToHtmlHandler(xml.sax.ContentHandler):
    """Record the SAX calls that rebuild a case XML document as HTML.

    Nothing is built here: every SAX event received is translated into one or
    more ``(method, args)`` tuples appended to ``tag_stack``, where ``method``
    is one of ``sax_start`` / ``sax_end`` / ``sax_chars``. The recorded calls
    are meant to be replayed afterwards (see ``render_sax_tags``).

    Everything between the opening ``<casebody>`` and the first ``<opinion>``
    (or the closing ``</casebody>`` if there is no opinion) is wrapped in a
    ``<section class="head-matter">``.
    """

    # Source elements rendered as block-level content elements.
    _CONTENT_ELEMENTS = (
        "parties",
        "docketnumber",
        "court",
        "decisiondate",
        "otherdate",
        "attorneys",
        "author",
        "p",
        "blockquote",
    )

    def __init__(self, case_id):
        """Start with an empty call log.

        case_id is written verbatim to the ``data-case-id`` attribute of the
        outer ``<section class="casebody">``.
        """
        super().__init__()
        self.tag_stack = []
        self.case_id = case_id
        # True while the head-matter wrapper section is still open.
        self.head_matter_open = False

    @staticmethod
    def _content_attrs(name, attrs):
        """Build the HTML attributes for a content element.

        ``id`` falls back to the string "none" when the source element has no
        id. ``data-blocks`` is copied when the source element carries it, and
        every element other than ``p`` / ``blockquote`` gets its source name
        as ``class``.
        """
        # Build a separate dict: rebinding ``attrs`` itself would hide the
        # source attributes and the data-blocks check could never succeed.
        new_attrs = {"id": attrs.get("id", "none")}
        if "data-blocks" in attrs:
            new_attrs["data-blocks"] = attrs["data-blocks"]
        if name not in ("p", "blockquote"):
            new_attrs["class"] = name
        return new_attrs

    def startElement(self, name, attrs):
        """Record the opening HTML tag(s) for the source element ``name``.

        Raises KeyError when a required source attribute is missing
        (``firstpage`` / ``lastpage`` on casebody, ``label`` /
        ``citation-index`` on page-number, ``url`` / ``index`` on
        extracted-citation).
        """
        if name == "casebody":
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "section",
                        {
                            "class": "casebody",
                            "data-case-id": self.case_id,
                            "data-firstpage": attrs["firstpage"],
                            "data-lastpage": attrs["lastpage"],
                        },
                    ),
                )
            )
            self.tag_stack.append((sax_chars, ("\n ",)))
            # Open the head-matter wrapper right after the outer section.
            self.tag_stack.append((sax_start, ("section", {"class": "head-matter"})))
            self.head_matter_open = True
        elif name == "opinion":
            # The first opinion ends the head matter.
            if self.head_matter_open:
                self.close_head_matter()
            # set opinion type to 'none' for opinions that don't have 'type' in source xml
            attr_type = attrs.get("type", "none")
            self.tag_stack.append(
                (sax_start, ("article", {"class": "opinion", "data-type": attr_type}))
            )
        elif name == "page-number":
            label = attrs["label"]
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "a",
                        {
                            "id": "p" + label,
                            "href": f"#p{label}",
                            "data-label": label,
                            "data-citation-index": attrs["citation-index"],
                            "class": "page-label",
                        },
                    ),
                )
            )
        elif name == "extracted-citation":
            new_attrs = {"href": attrs["url"], "class": "citation", "data-index": attrs["index"]}
            if "case-ids" in attrs:
                new_attrs["data-case-ids"] = attrs["case-ids"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in ("footnotemark", "bracketnum"):
            new_attrs = {"class": name}
            if "href" in attrs:
                new_attrs["href"] = attrs["href"]
            if "id" in attrs:
                new_attrs["id"] = attrs["id"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in self._CONTENT_ELEMENTS:
            # content element
            if self.head_matter_open:
                self.tag_stack.append((sax_chars, (" ",)))
            # Same lookup as endElement, so start and end tags always agree:
            # parties -> h4, p/blockquote unchanged, the rest -> p.
            new_name = mapping.get(name, name)
            self.tag_stack.append((sax_start, (new_name, self._content_attrs(name, attrs))))
        else:
            # passthrough
            self.tag_stack.append((sax_start, (name, attrs)))

    def characters(self, text):
        """Record a text node."""
        # NOTE(review): as written here the replacement equals the value being
        # tested for, so this branch changes nothing; confirm the intended
        # whitespace literals against the upstream file.
        if self.head_matter_open and text == " ":
            text = " "
        self.tag_stack.append((sax_chars, (text,)))

    def endElement(self, name):
        """Record the closing tag, translated through ``mapping``."""
        # A casebody without any opinion still has its head matter open.
        if name == "casebody" and self.head_matter_open:
            self.close_head_matter()
        self.tag_stack.append((sax_end, (mapping.get(name, name),)))

    def close_head_matter(self):
        """Record the end of the head-matter section and clear the flag."""
        self.tag_stack.append((sax_end, ("section",)))
        self.tag_stack.append((sax_chars, ("\n ",)))
        self.head_matter_open = False


def xml_to_html(input, case_id):
    """Convert a case XML document to an HTML string.

    input is parsed with xml.sax through an XmlToHtmlHandler created for
    case_id; the calls it records are replayed by render_sax_tags and the
    resulting tree is serialized with lxml's "html" method as ``str``.
    """
    recorder = XmlToHtmlHandler(case_id)
    xml.sax.parseString(input, recorder)
    html_root = render_sax_tags(recorder.tag_stack)
    return etree.tostring(html_root, encoding=str, method="html")
160 changes: 129 additions & 31 deletions web/main/legal_document_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@
from django.conf import settings
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from pyquery import PyQuery
from main.case_xml_converter import xml_to_html

from main.utils import APICommunicationError, looks_like_case_law_link, looks_like_citation
from main.utils import (
APICommunicationError,
looks_like_case_law_link,
looks_like_citation,
)

vs_check = re.compile(" [vV][sS]?[.]? ")

Expand Down Expand Up @@ -519,8 +524,8 @@ def header_template(legal_document):
class CourtListener:
details = {
"name": "CourtListener",
"short_description": "hello",
"long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions",
"short_description": "CourtListener contains millions of legal opinions.",
"long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions.",
"link": settings.COURTLISTENER_BASE_URL,
"search_regexes": [],
"footnote_regexes": [],
Expand All @@ -532,11 +537,7 @@ def search(search_params):
if not settings.COURTLISTENER_API_KEY:
raise APICommunicationError("A CourtListener API key is required")
try:
params = (
{"citation": search_params.q}
if looks_like_citation(search_params.q)
else {"q": search_params.q}
)
params = CourtListener.get_search_params(search_params)
resp = requests.get(
f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search",
params,
Expand All @@ -552,13 +553,16 @@ def search(search_params):
results.append(
{
"fullName": r["caseName"],
"shortName": r["caseName"],
"fullCitations": ", ".join(r["citation"]),
"shortCitations": ", ".join(r["citation"][:3])
+ ("..." if len(r["citation"]) > 3 else ""),
"effectiveDate": parser.isoparse(r["dateFiled"]).strftime("%Y-%m-%d"),
"shortName": truncate_name(r["caseName"]),
"fullCitations": ", ".join(r["citation"]) if r["citation"] else "",
"shortCitations": (
", ".join(r["citation"][:3]) + ("..." if len(r["citation"]) > 3 else "")
if r["citation"]
else ""
),
"effectiveDate": parser.isoparse(r["dateFiled"][:25]).strftime("%Y-%m-%d"),
"url": f"{settings.COURTLISTENER_BASE_URL}{r['absolute_url']}",
"id": r["id"],
"id": r["cluster_id"],
}
)
return results
Expand All @@ -576,38 +580,132 @@ def pull(legal_doc_source, id):
)
resp.raise_for_status()
cluster = resp.json()
resp = requests.get(
f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{id}/",
headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"},
)
resp.raise_for_status()

opinion = resp.json()
cluster["html_info"] = {"source": "court listener"}
cluster["sub_opinions"].sort(key=lambda url: int(url.split("/")[-2]))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!


sub_opinion_jsons = []
for opinion in cluster["sub_opinions"]:
sub_opinion_jsons.append(CourtListener.get_opinion_body(opinion))

text_source = ""
for content_type in (
"xml_harvard",
"html_with_citations",
"html_columbia",
"html_lawbox",
"html_anon_2020",
"html",
"plain_text",
):
case_text = "".join(sub_opinion[content_type] for sub_opinion in sub_opinion_jsons)
if case_text:
case_text = case_text.replace('<?xml version="1.0" encoding="utf-8"?>', "")
text_source = content_type
break

if not case_text:
msg = f"Case text not found for cluster {id}"
raise Exception(msg)

if text_source == "xml_harvard":
case_text = CourtListener.prepare_case_html(cluster, case_text)

cluster["html_info"]["source_field"] = text_source
additional_metadata = (CourtListener.get_additional_cluster_metadata(id))["results"][0]

except requests.exceptions.HTTPError as e:
msg = f"Failed call to {resp.request.url}: {e}\n{resp.content}"
raise APICommunicationError(msg)

body = opinion["html"]
citations = [
f"{x.get('volume')} {x.get('reporter')} {x.get('page')}" for x in cluster["citations"]
]

# https://www.courtlistener.com/help/api/rest/#case-names
case_name = cluster["case_name"] or cluster["case_name_full"][:10000]
cluster["court"] = {"name": additional_metadata.get("court")}
cluster["docket_number"] = additional_metadata.get("docketNumber")

case = LegalDocument(
source=legal_doc_source,
short_name=cluster["case_name"],
name=cluster["case_name"],
short_name=cluster.get("case_name"),
name=case_name,
doc_class="Case",
citations=cluster["citations"],
jurisdiction="",
effective_date=cluster["date_filed"],
publication_date=cluster["date_filed"],
citations=citations,
jurisdiction=cluster.get("court_id"),
effective_date=parser.parse(cluster.get("date_filed")),
publication_date=parser.parse(cluster.get("date_modified")),
updated_date=datetime.now(),
source_ref=str(id),
content=body,
metadata=None,
content=case_text,
metadata=cluster,
)

return case

@staticmethod
def header_template(legal_document):
return "empty_header.html"
return "court_listener_header.html"

@staticmethod
def get_opinion_body(sub_opinion_url):
    """Fetch one opinion from the CourtListener v3 opinions endpoint.

    The opinion number is taken from the second-to-last "/"-separated
    segment of sub_opinion_url. Returns the decoded JSON response;
    ``raise_for_status`` raises on an HTTP error status.
    """
    opinion_num = int(sub_opinion_url.split("/")[-2])
    endpoint = f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{opinion_num}/"
    auth_headers = {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}

    response = requests.get(endpoint, headers=auth_headers)
    response.raise_for_status()
    return response.json()

@staticmethod
def prepare_case_html(cluster, opinions_xml):
    """Wrap a cluster's head matter and opinion XML in <casebody> and convert to HTML.

    cluster must provide "headmatter" and "id"; opinions_xml is the
    concatenated opinion XML. Returns the HTML string produced by
    xml_to_html. Raises Exception (chained to the underlying error) when the
    conversion fails.
    """
    # XML declaration plus the opening casebody tag; page bounds are fixed at 0.
    casebody_open = (
        "<?xml version='1.0' encoding='utf-8'?>\n<casebody firstpage='0' lastpage='0'>"
    )
    # Defensive: a null headmatter would otherwise be interpolated as the
    # literal text "None" inside the document.
    head_matter = cluster["headmatter"] or ""
    case_xml = f"{casebody_open}\n{head_matter}\n{opinions_xml}</casebody>"
    # 'mismatched br tag' and 'invalid attribute https:' error workarounds
    case_xml = case_xml.replace("<br>", "").replace('https:=""', "")

    try:
        return xml_to_html(case_xml, str(cluster["id"]))
    except Exception as e:
        msg = f"Error converting xml to html for case {cluster['id']}: {e}"
        # Chain explicitly so the original parser error stays attached as __cause__.
        raise Exception(msg) from e

@staticmethod
def get_search_params(search_params):
    """Build the query parameters for a CourtListener search request.

    The query text is sent as "citation" when looks_like_citation accepts it
    and as "q" otherwise. The date and court filters are read from
    search_params; any parameter whose value is None is left out.
    """
    query = search_params.q
    query_key = "citation" if looks_like_citation(query) else "q"
    candidates = {
        query_key: query,
        "filed_after": search_params.after_date,
        "filed_before": search_params.before_date,
        "court": search_params.jurisdiction,
    }
    return {key: value for key, value in candidates.items() if value is not None}

@staticmethod
def get_additional_cluster_metadata(cluster_id):
    """
    Query the search endpoint for a single cluster.

    Additional metadata about a cluster, such as court and docket number, is
    available from the search endpoint rather than the clusters endpoint.
    Returns the decoded JSON response; raise_for_status raises on an HTTP
    error status.
    """
    search_url = f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search"
    query = {"q": f"cluster_id:{cluster_id}"}
    auth_headers = {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}

    response = requests.get(search_url, query, headers=auth_headers)
    response.raise_for_status()
    return response.json()


class LegacyNoSearch:
Expand Down
6 changes: 5 additions & 1 deletion web/main/templates/export/as_printable_html/node.html
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ <h2 class="subtitle">{{ node.subtitle }}</h2>

{% if node.resource_type.lower == 'legaldocument' %}
{% if node.resource.doc_class.lower == 'case' %}
{% include "includes/legal_doc_sources/cap_header.html" with legal_doc=node.resource %}
{% if node.resource.metadata.html_info.source == 'cap' %}
{% include "includes/legal_doc_sources/cap_header.html" with legal_doc=node.resource %}
{% elif node.resource.metadata.html_info.source == 'court listener' %}
{% include "includes/legal_doc_sources/court_listener_header.html" with legal_doc=node.resource %}
{% endif %}
{% elif node.resource.doc_class.lower == 'code' %}
{% include "includes/legal_doc_sources/gpo_header.html" with legal_doc=node.resource %}
{% endif %}
Expand Down
5 changes: 2 additions & 3 deletions web/main/templates/includes/legal_doc_sources/cap_header.html
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
<header class="case-header legal-doc-header" >
{% with md=legal_doc.metadata %}
<div class="court" data-custom-style="Case Header"> {{ md.court.name }}</div>
<div class="title" data-custom-style="Case Header">{{ legal_doc.get_title }}</div>
<div class="citation" data-custom-style="Case Header"> {{legal_doc.cite_string}} </div>
<div class="court" data-custom-style="Case Header">{{ md.court.name }}</div>
<div class="citation" data-custom-style="Case Header">{{ legal_doc.cite_string }}</div>
{% if md.docket_number %}<div class="docketnumber" data-custom-style="Case Header">{{ md.docket_number }}</div>{% endif %}
{% if md.decision_date %}<div class="decisiondate" data-custom-style="Case Header">{{ md.decision_date }}</div>{% endif %}
{% endwith %}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{# Case header: court name and citation string, plus docket number and filing date when present in legal_doc.metadata. #}
<header class="case-header legal-doc-header" >
{% with md=legal_doc.metadata %}
<div class="court" data-custom-style="Case Header">{{ md.court.name }}</div>
<div class="citation" data-custom-style="Case Header">{{ legal_doc.cite_string }}</div>
{% if md.docket_number %}<div class="docketnumber" data-custom-style="Case Header">{{ md.docket_number }}</div>{% endif %}
{% if md.date_filed %}<div class="decisiondate" data-custom-style="Case Header">{{ md.date_filed }}</div>{% endif %}
{% endwith %}
</header>
Loading