Skip to content

Commit

Permalink
Merge pull request #1258 from freelawproject/fix-colo-ct-app
Browse files Browse the repository at this point in the history
feat(colo): Update colorado
  • Loading branch information
grossir authored Nov 26, 2024
2 parents dcb68ea + 8299bd4 commit ab37154
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 73 deletions.
72 changes: 40 additions & 32 deletions juriscraper/opinions/united_states/state/colo.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ class Site(OpinionSiteLinear):
days_interval = 30
first_opinion_date = datetime(2010, 1, 1)
api_court_code = "14024_01"
label_to_key = {
"Docket Number": "docket",
"Parties": "name",
"Decision Date": "date",
"Citation": "citation",
}

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -37,7 +43,7 @@ def __init__(self, *args, **kwargs):
"court": self.api_court_code,
"bypass_rabl": "true",
"include": "parent,abstract,snippet,properties_with_ids",
"per_page": "30", # Server breaks down when per_page=500, returns 503
"per_page": "50", # Server breaks down when per_page=500, returns 503
"page": "1",
"sort": "date",
"type": "document",
Expand All @@ -48,6 +54,7 @@ def __init__(self, *args, **kwargs):
}
self.update_url()

# https://www.coloradojudicial.gov/system/files/opinions-2024-11/24SC459.pdf
# Request won't work without some of these X- headers
self.request["headers"].update(
{
Expand All @@ -60,6 +67,27 @@ def __init__(self, *args, **kwargs):
self.expected_content_types = ["text/html"]
self.make_backscrape_iterable(kwargs)

def update_case(self, case: dict, detail_json: dict) -> dict:
    """Copy the mapped properties of a detail response into a case dict

    Every entry of `detail_json["properties"]` whose label is a key of
    `self.label_to_key` has its first value stored in `case` under the
    mapped key. When a "Citation" property carries a second value, that
    value is stored as "parallel_citation".

    Properties with an empty `values` list are skipped, so whatever the
    caller seeded for that key is kept instead of raising IndexError.

    The "status" key is always set: "Published" when the case ends up
    with a non-empty citation, "Unpublished" otherwise (including when
    no "citation" key exists at all).

    :param case: the case data; it is updated in place
    :param detail_json: The json response
    :return: The updated case data (the same object passed as `case`)
    """
    for prop in detail_json["properties"]:
        label = prop["property"]["label"]
        values = prop["values"]
        if label not in self.label_to_key or not values:
            continue

        case[self.label_to_key[label]] = values[0]
        # A second citation value, when present, is the parallel citation
        if label == "Citation" and len(values) > 1:
            case["parallel_citation"] = values[1]

    # .get() so a caller that did not pre-seed "citation" does not crash
    case["status"] = "Published" if case.get("citation") else "Unpublished"
    return case

def _process_html(self) -> None:
search_json = self.html
logger.info(
Expand All @@ -69,9 +97,9 @@ def _process_html(self) -> None:
)

for result in search_json["results"]:
case = {"citation": "", "parallel_citation": ""}
timestamp = str(datetime.now().timestamp())[:10]
url = self.detail_url.format(result["id"], timestamp)

if self.test_mode_enabled():
# we have manually nested detail JSONs
# to be able to have a test file
Expand All @@ -82,37 +110,17 @@ def _process_html(self) -> None:
self._request_url_get(url)
detail_json = self.request["response"].json()

# Reset variables to prevent sticking previous values
# when a value is missing
docket_number, case_name_full, date_filed = "", "", ""

# Example of parallel citation:
# https://research.coloradojudicial.gov/vid/907372624
citation, parallel_citation = "", ""
for p in detail_json["properties"]:
label = p["property"]["label"]
if label == "Docket Number":
docket_number = p["values"][0]
elif label == "Parties":
case_name_full = p["values"][0]
elif label == "Decision Date":
# Note that json['published_at'] is not the date_filed
date_filed = p["values"][0]
elif label == "Citation":
citation = p["values"][0]
if len(p["values"]) > 1:
parallel_citation = p["values"][1]

case = {
"date": date_filed,
"docket": docket_number,
"name": case_name_full,
"url": f"{detail_json['public_url']}/content",
"status": "Published" if citation else "Unknown",
"citation": citation,
"parallel_citation": parallel_citation,
}
if (
self.court_id
== "juriscraper.opinions.united_states.state.colo"
):
case["url"] = f"{detail_json['public_url']}/content"
else:
case["url"] = (
f"https://colorado.vlex.io/pdf_viewer/{result.get('id')}"
)

case = self.update_case(case, detail_json)
self.cases.append(case)

def _download_backwards(self, dates: Tuple[date]) -> None:
Expand Down
40 changes: 3 additions & 37 deletions juriscraper/opinions/united_states/state/coloctapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,40 +20,6 @@ class Site(colo.Site):
api_court_code = "14024_02"
days_interval = 15

@staticmethod
def cleanup_content(content: str) -> bytes:
    """Strip volatile markup from returned HTML for proper ingestion

    The HTML seems to change constantly, so some of these
    steps may be outdated (Check juriscraper#1198 and courtlistener#4443)

    - delete style and img tags which hold tokens
      that make the hash change every time
    - delete classes which conflict with our bootstrap
      classes, such as .h2 and .h3
    - delete the `data-data` and `data-dest-detail` attributes

    :param content: html string
    :return: cleaned up html, pretty printed and encoded as UTF-8 bytes
    """
    tree = html.fromstring(content)

    for tag in ("style", "img"):
        for element in tree.xpath(f"//{tag}"):
            # drop_tree() keeps the text that follows the element (its
            # tail); getparent().remove(element) would delete that text
            element.drop_tree()

    volatile_attributes = (
        "class",
        # contains json like data with "ctm" key
        "data-data",
        # contains coordinate like data
        "data-dest-detail",
    )
    for attribute in volatile_attributes:
        for element in tree.xpath(f"//*[@{attribute}]"):
            del element.attrib[attribute]

    cleaned = html.tostring(tree, pretty_print=True, encoding="unicode")
    return cleaned.encode("utf-8")
def __init__(self, *args, **kwargs):
    """Initialize through the parent scraper, then expect PDF downloads

    All positional and keyword arguments are forwarded unchanged to the
    parent class initializer.
    """
    super().__init__(*args, **kwargs)
    # Set after the parent initializer so this value is the one kept
    self.expected_content_types = ["application/pdf"]
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"case_dates": "2024-07-01",
"case_names": "In re the Marriage of Elyssa M. Fox, and Alexander L. Speaker",
"download_urls": "https://colorado.vlex.io/vid/in-re-marriage-of-1042461692/content",
"precedential_statuses": "Unknown",
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "24SC76",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
{
"case_dates": "2024-05-16",
"case_names": "The People of the State of Colorado, In the Interest of T.C., Jr., a Child, and Concerning L.N.P. and T.C.",
"download_urls": "https://colorado.vlex.io/vid/people-ex-rel-t-1035297640/content",
"precedential_statuses": "Unknown",
"download_urls": "https://colorado.vlex.io/pdf_viewer/1035297640",
"precedential_statuses": "Unpublished",
"blocked_statuses": false,
"date_filed_is_approximate": false,
"docket_numbers": "23CA1539",
Expand All @@ -14,7 +14,7 @@
{
"case_dates": "2024-05-16",
"case_names": "Andrew Ortiz v. Progressive Direct Insurance Company",
"download_urls": "https://colorado.vlex.io/vid/ortiz-v-progressive-direct-1035272272/content",
"download_urls": "https://colorado.vlex.io/pdf_viewer/1035272272",
"precedential_statuses": "Published",
"blocked_statuses": false,
"date_filed_is_approximate": false,
Expand Down

0 comments on commit ab37154

Please sign in to comment.