
add first draft of CL search #2051

Merged
merged 20 commits on Jun 26, 2024
Changes from 13 commits
151 changes: 151 additions & 0 deletions web/main/case_xml_converter.py
@@ -0,0 +1,151 @@
"""
Convert between XML and HTML versions of CAP's formatted case data.
"""

import lxml.sax
import lxml.html
import xml.sax

from lxml import etree

# sax functions passed to render_sax_tags
sax_start = lxml.sax.ElementTreeContentHandler.startElement
sax_end = lxml.sax.ElementTreeContentHandler.endElement
sax_chars = lxml.sax.ElementTreeContentHandler.characters

mapping = {
    "casebody": "section",
    "parties": "h4",
    "docketnumber": "p",
    "court": "p",
    "decisiondate": "p",
    "otherdate": "p",
    "attorneys": "p",
    "opinion": "article",
    "author": "p",
    "page-number": "a",
    "extracted-citation": "a",
    "bracketnum": "a",
    "footnotemark": "a",
}


def render_sax_tags(tag_stack):
    # run all of our commands, like "sax_start(*args)", to actually build the xml tree
    handler = lxml.sax.ElementTreeContentHandler()
    for method, args in tag_stack:
        method(handler, *args)
    return handler._root


class XmlToHtmlHandler(xml.sax.ContentHandler):
    def __init__(self, case_id):
        self.tag_stack = []
        self.case_id = case_id
        self.head_matter_open = False

    def startElement(self, name, attrs):
        if name == "casebody":
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "section",
                        {
                            "class": "casebody",
                            "data-case-id": self.case_id,
                            "data-firstpage": attrs["firstpage"],
                            "data-lastpage": attrs["lastpage"],
                        },
                    ),
                )
            )
            self.tag_stack.append((sax_chars, ("\n  ",)))
            self.tag_stack.append((sax_start, ("section", {"class": "head-matter"})))
            self.head_matter_open = True
        elif name == "opinion":
            if self.head_matter_open:
                self.close_head_matter()
            # set opinion type to 'none' for opinions that don't have 'type' in source xml
            attr_type = attrs.get("type", "none")
            self.tag_stack.append(
                (sax_start, ("article", {"class": "opinion", "data-type": attr_type}))
            )
        elif name == "page-number":
            label = attrs["label"]
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "a",
                        {
                            "id": "p" + label,
                            "href": f"#p{label}",
                            "data-label": label,
                            "data-citation-index": attrs["citation-index"],
                            "class": "page-label",
                        },
                    ),
                )
            )
        elif name == "extracted-citation":
            new_attrs = {"href": attrs["url"], "class": "citation", "data-index": attrs["index"]}
            if "case-ids" in attrs:
                new_attrs["data-case-ids"] = attrs["case-ids"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in ("footnotemark", "bracketnum"):
            new_attrs = {"class": name}
            if "href" in attrs:
                new_attrs["href"] = attrs["href"]
            if "id" in attrs:
                new_attrs["id"] = attrs["id"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in (
            "parties",
            "docketnumber",
            "court",
            "decisiondate",
            "otherdate",
            "attorneys",
            "author",
            "p",
            "blockquote",
        ):
            # content element
            # set id to 'none' for elements that don't have 'id' in source xml
            new_attrs = {"id": attrs.get("id", "none")}
            # copy data-blocks from the source attrs before they are discarded
            if "data-blocks" in attrs:
                new_attrs["data-blocks"] = attrs["data-blocks"]
            if name not in ("p", "blockquote"):
                new_attrs["class"] = name
            new_name = "h4" if name == "parties" else "blockquote" if name == "blockquote" else "p"
            if self.head_matter_open:
                self.tag_stack.append((sax_chars, ("  ",)))
            self.tag_stack.append((sax_start, (new_name, new_attrs)))
        else:
            # passthrough
            self.tag_stack.append((sax_start, (name, attrs)))

    def characters(self, text):
        if self.head_matter_open and text == "\n  ":
            # add an extra level of indentation to text nodes inside head matter
            text = "\n    "
        self.tag_stack.append((sax_chars, (text,)))

    def endElement(self, name):
        if name == "casebody" and self.head_matter_open:
            self.close_head_matter()
        self.tag_stack.append((sax_end, (mapping.get(name, name),)))

    def close_head_matter(self):
        self.tag_stack.append((sax_end, ("section",)))
        self.tag_stack.append((sax_chars, ("\n  ",)))
        self.head_matter_open = False


def xml_to_html(input, case_id):
    handler = XmlToHtmlHandler(case_id)
    xml.sax.parseString(input, handler)
    tree = render_sax_tags(handler.tag_stack)
    return etree.tostring(tree, encoding=str, method="html")
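
For orientation, here is a minimal usage sketch of the converter. The input XML and case id are invented for illustration; real input is the Harvard-sourced casebody XML that `prepare_case_html` (below) assembles, and `xml_to_html` is the function defined in this module:

```python
# Hypothetical, minimal casebody XML in the CAP-style format the handler expects.
example_xml = """<?xml version='1.0' encoding='utf-8'?>
<casebody firstpage="0" lastpage="0">
  <parties id="b1">Smith v. Jones</parties>
  <opinion type="majority">
    <author id="b2">Example, J.</author>
    <p id="b3">Text of the opinion.</p>
  </opinion>
</casebody>"""

html = xml_to_html(example_xml, "12345")
# Expected shape: a <section class="casebody" data-case-id="12345" ...> wrapping
# a <section class="head-matter"> (the parties line) and an <article class="opinion">.
print(html)
```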
123 changes: 94 additions & 29 deletions web/main/legal_document_sources.py
@@ -13,8 +13,13 @@
from django.conf import settings
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from pyquery import PyQuery
+from .case_xml_converter import xml_to_html
Contributor:
This doesn't really matter, but FWIW: I think this project's convention is to stick with absolute imports (e.g. `from main.case_xml_converter import xml_to_html`, as with `from main.utils import ...`) rather than relative imports like `from .case_xml_converter`. But yeah, totally doesn't matter.


-from main.utils import APICommunicationError, looks_like_case_law_link, looks_like_citation
+from main.utils import (
+    APICommunicationError,
+    looks_like_case_law_link,
+    looks_like_citation,
+)

vs_check = re.compile(" [vV][sS]?[.]? ")

@@ -519,8 +524,8 @@ def header_template(legal_document):
class CourtListener:
    details = {
        "name": "CourtListener",
-        "short_description": "hello",
-        "long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions",
+        "short_description": "CourtListener contains millions of legal opinions.",
+        "long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions.",
        "link": settings.COURTLISTENER_BASE_URL,
        "search_regexes": [],
        "footnote_regexes": [],
@@ -532,11 +537,7 @@ def search(search_params):
        if not settings.COURTLISTENER_API_KEY:
            raise APICommunicationError("A CourtListener API key is required")
        try:
-            params = (
-                {"citation": search_params.q}
-                if looks_like_citation(search_params.q)
-                else {"q": search_params.q}
-            )
+            params = CourtListener.cl_params(search_params)
Contributor:
Tiny suggestion: what would you think of renaming `CourtListener.cl_params` to `CourtListener.format_search_params` or `CourtListener.get_search_params`?

Contributor Author:
most definitely, I will update

            resp = requests.get(
                f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search",
                params,
@@ -552,13 +553,16 @@ def search(search_params):
            results.append(
                {
                    "fullName": r["caseName"],
-                    "shortName": r["caseName"],
-                    "fullCitations": ", ".join(r["citation"]),
-                    "shortCitations": ", ".join(r["citation"][:3])
-                    + ("..." if len(r["citation"]) > 3 else ""),
-                    "effectiveDate": parser.isoparse(r["dateFiled"]).strftime("%Y-%m-%d"),
+                    "shortName": truncate_name(r["caseName"]),
+                    "fullCitations": ", ".join(r["citation"]) if r["citation"] else "",
+                    "shortCitations": (
+                        ", ".join(r["citation"][:3]) + ("..." if len(r["citation"]) > 3 else "")
+                        if r["citation"]
+                        else ""
+                    ),
+                    "effectiveDate": parser.isoparse(r["dateFiled"][:25]).strftime("%Y-%m-%d"),
                    "url": f"{settings.COURTLISTENER_BASE_URL}{r['absolute_url']}",
-                    "id": r["id"],
+                    "id": r["cluster_id"],
                }
            )
        return results
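
As an aside for readers, the `shortCitations` expression above truncates to the first three citations; a small illustration with invented values:

```python
# Hypothetical citation list with more than three entries.
citations = ["410 U.S. 113", "93 S. Ct. 705", "35 L. Ed. 2d 147", "1973 WL 12345"]

short = ", ".join(citations[:3]) + ("..." if len(citations) > 3 else "")
assert short == "410 U.S. 113, 93 S. Ct. 705, 35 L. Ed. 2d 147..."

# The added `if r["citation"] else ""` guard covers results with no citations at all:
empty = []
assert ((", ".join(empty[:3]) + ("..." if len(empty) > 3 else "")) if empty else "") == ""
```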
@@ -576,39 +580,100 @@ def pull(legal_doc_source, id):
            )
            resp.raise_for_status()
            cluster = resp.json()
-            resp = requests.get(
-                f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{id}/",
-                headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"},
-            )
-            resp.raise_for_status()
-
-            opinion = resp.json()
+            if cluster["filepath_json_harvard"]:
+                harvard_xml_data = ""
+                for sub_opinion in cluster["sub_opinions"]:
+                    opinion = CourtListener.get_opinion_body(sub_opinion)
+                    if opinion["xml_harvard"]:
+                        opinion_xml = opinion["xml_harvard"].replace(
+                            '<?xml version="1.0" encoding="utf-8"?>', ""
+                        )
+                        harvard_xml_data += f"{opinion_xml}\n"
+                case_html = CourtListener.prepare_case_html(cluster, harvard_xml_data)
+            else:
+                opinion = CourtListener.get_opinion_body(cluster["sub_opinions"][0])
+                case_html = opinion["html"] if opinion["html"] else opinion["plain_text"]
jcushman (Contributor) commented on Jun 18, 2024:
The logic here wants to be something like "use all xml_harvard if they exist, else all html if they exist, else all plain_text." So what if you do something like this?

Suggested change
-            if cluster["filepath_json_harvard"]:
-                harvard_xml_data = ""
-                for sub_opinion in cluster["sub_opinions"]:
-                    opinion = CourtListener.get_opinion_body(sub_opinion)
-                    if opinion["xml_harvard"]:
-                        opinion_xml = opinion["xml_harvard"].replace(
-                            '<?xml version="1.0" encoding="utf-8"?>', ""
-                        )
-                        harvard_xml_data += f"{opinion_xml}\n"
-                case_html = CourtListener.prepare_case_html(cluster, harvard_xml_data)
-            else:
-                opinion = CourtListener.get_opinion_body(cluster["sub_opinions"][0])
-                case_html = opinion["html"] if opinion["html"] else opinion["plain_text"]
+            for cl_type in ('xml_harvard', 'html', 'plain_text'):
+                case_text = ''.join(sub_opinion[cl_type] for sub_opinion in cluster['sub_opinions'])
+                if case_text:
+                    break
+            else:
+                # failed to find anything ...
+            # do stuff based on cl_type and case_text ...

I think this will also address your question:

> I saw that some opinions don't have either of the content fields (xml_harvard, plain_text). Think about what to do in this case. Can we fall back on other html fields? Or disable importing for those documents?

Are there clusters where none of the opinions have any of those fields? I think it might be just that some opinions have some of the fields and others have other fields.

Contributor Author:
I remember running into this scenario where neither of those fields were populated for an opinion, but darn, it looks like I didn't save its id. Also there is some info here about the available text fields, but it doesn't mention whether any of them will always be populated.

Contributor:
Here's an example where the existing logic doesn't work: https://www.courtlistener.com/api/rest/v3/clusters/86480/

This has the full opinion text in the first opinion as "html", and then the full opinion text split into three parts in "xml_harvard." Checking type by type instead of opinion by opinion will handle this.

Contributor Author:

Jack, can you clarify the "checking type by type instead of opinion by opinion" piece? To be able to see the content type ('xml_harvard', 'html', 'plain_text', etc.) of the opinion, I still need to query the sub-opinion first, as there is no indicator at the cluster level.

Contributor:
My big thought here is that it is never correct to assemble multiple types -- op[0]['plain_text'] + op[1]['html'] is always wrong. Logically what we're trying to do is find the chosen type and then glue it together. So I think this algorithm will turn out to be more robust against weird edge cases, and easy to get correct, if the logic is:

  • fetch all subopinions
  • from most to least preferred type, concatenate that type together from all subopinions. when you find one that isn't empty when concatenated, break, that's the chosen type.
  • process the concatenation as appropriate to the type

That's what I was trying to gesture at with my code sketch, though I skipped the step of pre-fetching all the subopinions.

By the way! I noticed that opinions aren't always sorted in the correct order: https://www.courtlistener.com/api/rest/v3/clusters/3390160/. From CL Slack it sounds like the best thing for now is to sort them by ID.

teovin (Contributor Author) commented on Jun 26, 2024:
I implemented this and also added the sorting. My initial thought was to prevent querying the other opinions if the source was Harvard. Also, I had chosen the opinion with index 0 in the else block because all of the clusters that weren't from Harvard had only 1 opinion (those that I used in testing). But it makes sense to account for the existence of multiples, since I didn't check all of the existing clusters. Let me know how it looks now.

Contributor:
Yeah, avoiding extra queries totally makes sense! I ended up seeing enough variation in the CL API that I like the idea of being more robust to edge cases, as well as having a clean way to add stuff later like "turns out we prefer the html_columbia field but it needs special processing." Thanks for switching it around.
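
For readers following along, a compact sketch of the algorithm jcushman describes above (not the code as merged; `fetch_opinion` stands in for `CourtListener.get_opinion_body`, and the opinion payloads are assumed to expose an `id` field, as the v3 opinions API does):

```python
def choose_case_text(cluster, fetch_opinion):
    # Fetch every sub-opinion once; the API does not guarantee order,
    # so sort by opinion id, per the advice above.
    opinions = sorted(
        (fetch_opinion(url) for url in cluster["sub_opinions"]),
        key=lambda op: op["id"],
    )
    # From most to least preferred type, concatenate that type across all
    # sub-opinions; the first non-empty concatenation wins. Types are never mixed.
    for cl_type in ("xml_harvard", "html", "plain_text"):
        case_text = "".join(op.get(cl_type) or "" for op in opinions)
        if case_text:
            return cl_type, case_text
    raise ValueError(f"no usable text in cluster {cluster['id']}")
```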


        except requests.exceptions.HTTPError as e:
            msg = f"Failed call to {resp.request.url}: {e}\n{resp.content}"
            raise APICommunicationError(msg)

-        body = opinion["html"]
+        citations = [
+            f"{x.get('volume')} {x.get('reporter')} {x.get('page')}" for x in cluster["citations"]
+        ]
+        cluster["html_info"] = {"source": "court listener"}
+
+        # https://www.courtlistener.com/help/api/rest/#case-names
+        case_name = ""
+        if cluster["case_name"]:
+            case_name = cluster["case_name"]
+        elif cluster["case_name_full"]:
+            case_name = cluster["case_name_full"][:10000]
Contributor:
Style note: I find this clearer as `case_name = cluster["case_name"] or cluster["case_name_full"][:10000]`.


        case = LegalDocument(
            source=legal_doc_source,
-            short_name=cluster["case_name"],
-            name=cluster["case_name"],
+            short_name=cluster.get("case_name"),
+            name=case_name,
            doc_class="Case",
-            citations=cluster["citations"],
-            jurisdiction="",
-            effective_date=cluster["date_filed"],
-            publication_date=cluster["date_filed"],
+            citations=citations,
+            jurisdiction=cluster.get("court_id"),
+            effective_date=parser.parse(cluster.get("date_filed")),
+            publication_date=parser.parse(cluster.get("date_modified")),
            updated_date=datetime.now(),
            source_ref=str(id),
-            content=body,
-            metadata=None,
+            content=case_html,
+            metadata=cluster,
        )
        return case

    @staticmethod
    def header_template(legal_document):
        return "empty_header.html"

+    @staticmethod
+    def get_opinion_body(sub_opinion_url):
+        opinion_num = int(sub_opinion_url.split("/")[-2])
+        resp = requests.get(
+            f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{opinion_num}/",
+            headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"},
+        )
+
+        resp.raise_for_status()
+        return resp.json()
+
+    @staticmethod
+    def prepare_case_html(cluster, opinions_xml):
+        xml_declaration = (
+            "<?xml version='1.0' encoding='utf-8'?>\n<casebody firstpage='0' lastpage='0'>"
+        )
+        case_xml = f"{xml_declaration}\n{cluster['headmatter']}\n{opinions_xml}</casebody>"
+        # 'mismatched br tag' and 'invalid attribute https:' error workarounds
+        case_xml = case_xml.replace("<br>", "").replace('https:=""', "")
+
+        try:
+            converted_case_html = xml_to_html(case_xml, str(cluster["id"]))
+        except Exception as e:
+            msg = f"Error converting xml to html for case {cluster['id']}: {e}"
+            raise Exception(msg)
+
+        return converted_case_html
+
+    @staticmethod
+    def cl_params(search_params):
+        search_type_param = (
+            {"citation": search_params.q}
+            if looks_like_citation(search_params.q)
+            else {"q": search_params.q}
+        )
+        search_params = {
+            "filed_after": search_params.after_date,
+            "filed_before": search_params.before_date,
+            "court": search_params.jurisdiction,
+        }
+        params = {**search_type_param, **search_params}
+        return {k: params[k] for k in params.keys() if params[k] is not None}
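
As a sanity check on the `None` filtering in `cl_params`, a hypothetical call; `SimpleNamespace` stands in for whatever search-params object callers pass, and only the four attributes read above matter:

```python
from types import SimpleNamespace

# Hypothetical input: a citation-shaped query with no date or court filters.
sp = SimpleNamespace(q="410 U.S. 113", after_date=None, before_date=None, jurisdiction=None)

print(CourtListener.cl_params(sp))
# -> {'citation': '410 U.S. 113'}
# Assuming looks_like_citation recognizes the pattern, the query goes in
# "citation" rather than "q", and the None-valued filters are dropped.
```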


class LegacyNoSearch:
    details = {