-
Notifications
You must be signed in to change notification settings - Fork 30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add first draft of CL search #2051
Changes from 19 commits
e97acd6
42575a5
776bb60
a040698
c22eadc
1d7be58
041007a
7bc82ff
1a1f7d2
5aaf88c
1cb9e47
09d79c0
7c20b0a
ef85d09
4393e2e
c26f0a9
f522cf2
6562520
6afd2c2
e05079e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
""" | ||
Convert between XML and HTML versions of CAP's formatted case data. | ||
""" | ||
|
||
import lxml.sax | ||
import lxml.html | ||
import xml.sax | ||
|
||
from lxml import etree | ||
|
||
# Unbound ElementTreeContentHandler methods. XmlToHtmlHandler records them,
# paired with their arguments, and render_sax_tags later replays each one
# against a fresh handler instance.
sax_start, sax_end, sax_chars = (
    lxml.sax.ElementTreeContentHandler.startElement,
    lxml.sax.ElementTreeContentHandler.endElement,
    lxml.sax.ElementTreeContentHandler.characters,
)
|
||
# Source XML tag name -> HTML tag name emitted in its place. Tags that are
# absent from this table keep their own name (see the mapping.get(name, name)
# lookup in XmlToHtmlHandler.endElement).
mapping = {
    "casebody": "section",
    "parties": "h4",
    **dict.fromkeys(
        ("docketnumber", "court", "decisiondate", "otherdate", "attorneys"), "p"
    ),
    "opinion": "article",
    "author": "p",
    **dict.fromkeys(
        ("page-number", "extracted-citation", "bracketnum", "footnotemark"), "a"
    ),
}
|
||
|
||
def render_sax_tags(tag_stack):
    """Replay recorded SAX calls and return the root element they build.

    Args:
        tag_stack: iterable of ``(method, args)`` pairs, where ``method`` is an
            unbound ``lxml.sax.ElementTreeContentHandler`` method (such as
            ``sax_start``) and ``args`` is the tuple of arguments to pass
            after the handler instance.

    Returns:
        The root element of the tree produced by running every recorded call
        in order.
    """
    handler = lxml.sax.ElementTreeContentHandler()
    for method, args in tag_stack:
        # e.g. sax_start(handler, "section", {...})
        method(handler, *args)
    # Go through the handler's public ``etree`` property instead of reaching
    # into its private ``_root`` attribute.
    return handler.etree.getroot()
|
||
|
||
class XmlToHtmlHandler(xml.sax.ContentHandler):
    """SAX handler that translates case XML events into recorded HTML-building calls.

    Nothing is rendered here. Each callback appends ``(method, args)`` pairs to
    ``tag_stack``, using the module-level ``sax_start`` / ``sax_end`` /
    ``sax_chars`` callables; ``render_sax_tags`` replays them to build the tree.

    Everything between the opening ``<casebody>`` and the first ``<opinion>``
    (or the closing ``</casebody>`` when there is no opinion) is wrapped in a
    ``<section class="head-matter">``.
    """

    # Source tags rendered as block-level content elements (h4 / p / blockquote).
    _CONTENT_TAGS = (
        "parties",
        "docketnumber",
        "court",
        "decisiondate",
        "otherdate",
        "attorneys",
        "author",
        "p",
        "blockquote",
    )

    def __init__(self, case_id):
        """Create a handler for one case.

        Args:
            case_id: value written to the ``data-case-id`` attribute of the
                outer ``<section class="casebody">``.
        """
        super().__init__()
        self.tag_stack = []
        self.case_id = case_id
        # True while the synthetic head-matter <section> is still open.
        self.head_matter_open = False

    def startElement(self, name, attrs):
        """Record the HTML start tag(s) corresponding to the source tag ``name``.

        Args:
            name: source XML tag name.
            attrs: SAX attributes of the source element.

        Raises:
            KeyError: if an attribute read by subscript is missing
                (``firstpage`` / ``lastpage`` on ``casebody``, ``label`` /
                ``citation-index`` on ``page-number``, ``url`` / ``index`` on
                ``extracted-citation``).
        """
        if name == "casebody":
            casebody_attrs = {
                "class": "casebody",
                "data-case-id": self.case_id,
                "data-firstpage": attrs["firstpage"],
                "data-lastpage": attrs["lastpage"],
            }
            self.tag_stack.append((sax_start, ("section", casebody_attrs)))
            self.tag_stack.append((sax_chars, ("\n ",)))
            # Open the head-matter wrapper immediately; it is closed by
            # close_head_matter() at the first opinion or at </casebody>.
            self.tag_stack.append((sax_start, ("section", {"class": "head-matter"})))
            self.head_matter_open = True
        elif name == "opinion":
            if self.head_matter_open:
                self.close_head_matter()
            # set opinion type to 'none' for opinions that don't have 'type' in source xml
            attr_type = attrs.get("type", "none")
            self.tag_stack.append(
                (sax_start, ("article", {"class": "opinion", "data-type": attr_type}))
            )
        elif name == "page-number":
            label = attrs["label"]
            page_attrs = {
                "id": "p" + label,
                "href": f"#p{label}",
                "data-label": label,
                "data-citation-index": attrs["citation-index"],
                "class": "page-label",
            }
            self.tag_stack.append((sax_start, ("a", page_attrs)))
        elif name == "extracted-citation":
            new_attrs = {
                "href": attrs["url"],
                "class": "citation",
                "data-index": attrs["index"],
            }
            if "case-ids" in attrs:
                new_attrs["data-case-ids"] = attrs["case-ids"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in ("footnotemark", "bracketnum"):
            new_attrs = {"class": name}
            if "href" in attrs:
                new_attrs["href"] = attrs["href"]
            if "id" in attrs:
                new_attrs["id"] = attrs["id"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in self._CONTENT_TAGS:
            # content element
            # set id to 'none' for elements that don't have 'id' in source xml
            # Build the output attributes in a separate dict: overwriting
            # ``attrs`` before the membership test below would make the test
            # always false and silently drop ``data-blocks``.
            new_attrs = {"id": attrs.get("id", "none")}
            if "data-blocks" in attrs:
                new_attrs["data-blocks"] = attrs["data-blocks"]
            if name not in ("p", "blockquote"):
                new_attrs["class"] = name
            if name == "parties":
                new_name = "h4"
            elif name == "blockquote":
                new_name = "blockquote"
            else:
                new_name = "p"
            if self.head_matter_open:
                self.tag_stack.append((sax_chars, (" ",)))
            self.tag_stack.append((sax_start, (new_name, new_attrs)))
        else:
            # passthrough
            self.tag_stack.append((sax_start, (name, attrs)))

    def characters(self, text):
        """Record a text node."""
        # NOTE(review): as written, this replaces a single space with a single
        # space. The two literals may originally have differed in width
        # (indentation inside head matter) -- confirm against the upstream file.
        if self.head_matter_open and text == " ":
            text = " "
        self.tag_stack.append((sax_chars, (text,)))

    def endElement(self, name):
        """Record the end tag for ``name``, translated through ``mapping``."""
        # A case without any <opinion> still has its head-matter wrapper open.
        if name == "casebody" and self.head_matter_open:
            self.close_head_matter()
        self.tag_stack.append((sax_end, (mapping.get(name, name),)))

    def close_head_matter(self):
        """Record the closing of the head-matter ``<section>`` and clear the flag."""
        self.tag_stack.append((sax_end, ("section",)))
        self.tag_stack.append((sax_chars, ("\n ",)))
        self.head_matter_open = False
|
||
|
||
def xml_to_html(input, case_id):
    """Convert a case XML document to an HTML string.

    Args:
        input: the XML document, handed to ``xml.sax.parseString``.
        case_id: identifier passed to ``XmlToHtmlHandler``.

    Returns:
        The converted tree serialized by ``etree.tostring`` with
        ``method="html"`` and ``encoding=str``.
    """
    recorder = XmlToHtmlHandler(case_id)
    xml.sax.parseString(input, recorder)
    html_root = render_sax_tags(recorder.tag_stack)
    return etree.tostring(html_root, method="html", encoding=str)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,8 +13,13 @@ | |
from django.conf import settings | ||
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector | ||
from pyquery import PyQuery | ||
from main.case_xml_converter import xml_to_html | ||
|
||
from main.utils import APICommunicationError, looks_like_case_law_link, looks_like_citation | ||
from main.utils import ( | ||
APICommunicationError, | ||
looks_like_case_law_link, | ||
looks_like_citation, | ||
) | ||
|
||
vs_check = re.compile(" [vV][sS]?[.]? ") | ||
|
||
|
@@ -519,8 +524,8 @@ def header_template(legal_document): | |
class CourtListener: | ||
details = { | ||
"name": "CourtListener", | ||
"short_description": "hello", | ||
"long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions", | ||
"short_description": "CourtListener contains millions of legal opinions.", | ||
"long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions.", | ||
"link": settings.COURTLISTENER_BASE_URL, | ||
"search_regexes": [], | ||
"footnote_regexes": [], | ||
|
@@ -532,11 +537,7 @@ def search(search_params): | |
if not settings.COURTLISTENER_API_KEY: | ||
raise APICommunicationError("A CourtListener API key is required") | ||
try: | ||
params = ( | ||
{"citation": search_params.q} | ||
if looks_like_citation(search_params.q) | ||
else {"q": search_params.q} | ||
) | ||
params = CourtListener.get_search_params(search_params) | ||
resp = requests.get( | ||
f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search", | ||
params, | ||
|
@@ -552,13 +553,16 @@ def search(search_params): | |
results.append( | ||
{ | ||
"fullName": r["caseName"], | ||
"shortName": r["caseName"], | ||
"fullCitations": ", ".join(r["citation"]), | ||
"shortCitations": ", ".join(r["citation"][:3]) | ||
+ ("..." if len(r["citation"]) > 3 else ""), | ||
"effectiveDate": parser.isoparse(r["dateFiled"]).strftime("%Y-%m-%d"), | ||
"shortName": truncate_name(r["caseName"]), | ||
"fullCitations": ", ".join(r["citation"]) if r["citation"] else "", | ||
"shortCitations": ( | ||
", ".join(r["citation"][:3]) + ("..." if len(r["citation"]) > 3 else "") | ||
if r["citation"] | ||
else "" | ||
), | ||
"effectiveDate": parser.isoparse(r["dateFiled"][:25]).strftime("%Y-%m-%d"), | ||
"url": f"{settings.COURTLISTENER_BASE_URL}{r['absolute_url']}", | ||
"id": r["id"], | ||
"id": r["cluster_id"], | ||
} | ||
) | ||
return results | ||
|
@@ -576,38 +580,124 @@ def pull(legal_doc_source, id): | |
) | ||
resp.raise_for_status() | ||
cluster = resp.json() | ||
resp = requests.get( | ||
f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{id}/", | ||
headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}, | ||
) | ||
resp.raise_for_status() | ||
cluster["html_info"] = {"source": "court listener"} | ||
cluster["sub_opinions"].sort(key=lambda url: int(url.split("/")[-2])) | ||
|
||
sub_opinion_jsons = [] | ||
for opinion in cluster["sub_opinions"]: | ||
sub_opinion_jsons.append(CourtListener.get_opinion_body(opinion)) | ||
|
||
text_source = "" | ||
for content_type in ("xml_harvard", "html", "plain_text"): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Looking at https://www.courtlistener.com/help/api/rest/v3/case-law/#opinion-endpoint, I'm realizing that "html" and "plain_text" are the least preferred fields, and "html_with_citations" is the most preferred field. What if we use this preference:
It looks to me like the citations markup [that they add when creating html_with_citations] is harmless and we could just pass it through. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Cool, I just added all text field options in the order CL specifies with the exception of |
||
case_text = "".join(sub_opinion[content_type] for sub_opinion in sub_opinion_jsons) | ||
if case_text: | ||
case_text = case_text.replace('<?xml version="1.0" encoding="utf-8"?>', "") | ||
text_source = content_type | ||
break | ||
|
||
if not case_text: | ||
msg = f"Case text not found for cluster {id}" | ||
raise Exception(msg) | ||
|
||
if text_source == "xml_harvard": | ||
case_text = CourtListener.prepare_case_html(cluster, case_text) | ||
|
||
opinion = resp.json() | ||
cluster["html_info"]["source_field"] = text_source | ||
additional_metadata = (CourtListener.get_additional_cluster_metadata(id))["results"][0] | ||
|
||
except requests.exceptions.HTTPError as e: | ||
msg = f"Failed call to {resp.request.url}: {e}\n{resp.content}" | ||
raise APICommunicationError(msg) | ||
|
||
body = opinion["html"] | ||
citations = [ | ||
f"{x.get('volume')} {x.get('reporter')} {x.get('page')}" for x in cluster["citations"] | ||
] | ||
|
||
# https://www.courtlistener.com/help/api/rest/#case-names | ||
case_name = cluster["case_name"] or cluster["case_name_full"][:10000] | ||
cluster["court"] = {"name": additional_metadata.get("court")} | ||
cluster["docket_number"] = additional_metadata.get("docketNumber") | ||
|
||
case = LegalDocument( | ||
source=legal_doc_source, | ||
short_name=cluster["case_name"], | ||
name=cluster["case_name"], | ||
short_name=cluster.get("case_name"), | ||
name=case_name, | ||
doc_class="Case", | ||
citations=cluster["citations"], | ||
jurisdiction="", | ||
effective_date=cluster["date_filed"], | ||
publication_date=cluster["date_filed"], | ||
citations=citations, | ||
jurisdiction=cluster.get("court_id"), | ||
effective_date=parser.parse(cluster.get("date_filed")), | ||
publication_date=parser.parse(cluster.get("date_modified")), | ||
updated_date=datetime.now(), | ||
source_ref=str(id), | ||
content=body, | ||
metadata=None, | ||
content=case_text, | ||
metadata=cluster, | ||
) | ||
|
||
return case | ||
|
||
@staticmethod | ||
def header_template(legal_document): | ||
return "empty_header.html" | ||
return "court_listener_header.html" | ||
|
||
@staticmethod
def get_opinion_body(sub_opinion_url):
    """Fetch a single sub-opinion from the CourtListener opinions endpoint.

    Args:
        sub_opinion_url: URL of the sub-opinion; its second-to-last
            "/"-separated piece is parsed as the integer opinion id
            (presumably the URL ends with a trailing slash -- verify
            against the cluster payload).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: via ``raise_for_status`` on an error
            response.
    """
    opinion_num = int(sub_opinion_url.split("/")[-2])
    endpoint = f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{opinion_num}/"
    auth_headers = {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}

    resp = requests.get(endpoint, headers=auth_headers)
    resp.raise_for_status()
    return resp.json()
|
||
@staticmethod
def prepare_case_html(cluster, opinions_xml):
    """Wrap a cluster's head matter and opinion XML in a casebody and convert it to HTML.

    Args:
        cluster: cluster dict; ``cluster["headmatter"]`` is placed before the
            opinions and ``cluster["id"]`` is passed to ``xml_to_html`` as the
            case id.
        opinions_xml: concatenated XML of the cluster's opinions.

    Returns:
        The HTML string produced by ``xml_to_html``.

    Raises:
        Exception: if the conversion fails; the original error is attached as
            the exception's cause.
    """
    casebody_open = (
        "<?xml version='1.0' encoding='utf-8'?>\n<casebody firstpage='0' lastpage='0'>"
    )
    case_xml = f"{casebody_open}\n{cluster['headmatter']}\n{opinions_xml}</casebody>"
    # 'mismatched br tag' and 'invalid attribute https:' error workarounds
    case_xml = case_xml.replace("<br>", "").replace('https:=""', "")

    try:
        converted_case_html = xml_to_html(case_xml, str(cluster["id"]))
    except Exception as e:
        msg = f"Error converting xml to html for case {cluster['id']}: {e}"
        # Chain explicitly so the underlying parse/convert error stays
        # attached as __cause__ in tracebacks.
        raise Exception(msg) from e

    return converted_case_html
|
||
@staticmethod
def get_search_params(search_params):
    """Build the query parameters for a CourtListener search request.

    The query text is sent as ``citation`` when ``looks_like_citation``
    accepts it and as ``q`` otherwise. The date and court filters are added
    alongside it, and every entry whose value is ``None`` is dropped.

    Args:
        search_params: object exposing ``q``, ``after_date``, ``before_date``
            and ``jurisdiction`` attributes.

    Returns:
        A dict containing only the parameters whose values are not ``None``.
    """
    query = search_params.q
    query_key = "citation" if looks_like_citation(query) else "q"
    candidates = {
        query_key: query,
        "filed_after": search_params.after_date,
        "filed_before": search_params.before_date,
        "court": search_params.jurisdiction,
    }
    return {key: value for key, value in candidates.items() if value is not None}
|
||
@staticmethod
def get_additional_cluster_metadata(cluster_id):
    """Look a cluster up through the search endpoint and return the JSON response.

    Additional metadata about a cluster, such as court and docket number, is
    available from the search endpoint rather than the clusters endpoint.

    Args:
        cluster_id: id interpolated into the ``cluster_id:<id>`` search query.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        requests.exceptions.HTTPError: via ``raise_for_status`` on an error
            response.
    """
    search_url = f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search"
    auth_headers = {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}

    resp = requests.get(
        search_url,
        params={"q": f"cluster_id:{cluster_id}"},
        headers=auth_headers,
    )
    resp.raise_for_status()
    return resp.json()
|
||
|
||
class LegacyNoSearch: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
<header class="case-header legal-doc-header" > | ||
{% with md=legal_doc.metadata %} | ||
<div class="court" data-custom-style="Case Header">{{ md.court.name }}</div> | ||
<div class="citation" data-custom-style="Case Header">{{ legal_doc.cite_string }}</div> | ||
{% if md.docket_number %}<div class="docketnumber" data-custom-style="Case Header">{{ md.docket_number }}</div>{% endif %} | ||
{% if md.date_filed %}<div class="decisiondate" data-custom-style="Case Header">{{ md.date_filed }}</div>{% endif %} | ||
{% endwith %} | ||
</header> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice!