Skip to content

Commit

Permalink
Merge branch 'main' into improve-okla-cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
flooie authored Oct 21, 2024
2 parents 35a222f + 1d47cc6 commit 8679a81
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 11 deletions.
10 changes: 7 additions & 3 deletions juriscraper/opinions/united_states/state/colo.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,19 +83,23 @@ def _process_html(self) -> None:
self._request_url_get(url)
detail_json = self.request["response"].json()

# Reset variables so that previous values do not stick around
# when a value is missing
docket_number, case_name_full, date_filed = "", "", ""

# Example of parallel citation:
# https://research.coloradojudicial.gov/vid/907372624
citation, parallel_citation = "", ""
for p in detail_json["properties"]:
label = p["property"]["label"]
if label == "Docket Number":
docket_number = p["values"][0]
if label == "Parties":
elif label == "Parties":
case_name_full = p["values"][0]
if label == "Decision Date":
elif label == "Decision Date":
# Note that json['published_at'] is not the date_filed
date_filed = p["values"][0]
if label == "Citation":
elif label == "Citation":
citation = p["values"][0]
if len(p["values"]) > 1:
parallel_citation = p["values"][1]
Expand Down
26 changes: 18 additions & 8 deletions juriscraper/opinions/united_states/state/coloctapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
- 2023-11-19: Updated by William E. Palin
"""

import re

from lxml import html

from juriscraper.opinions.united_states.state import colo
Expand All @@ -35,14 +37,22 @@ def cleanup_content(content: str) -> str:
:return: cleaned up html
"""
tree = html.fromstring(content)
remove_xpaths = ["//style", "//img"]
for xpath in remove_xpaths:
if tree.xpath(xpath):
to_remove = tree.xpath(xpath)[0]
to_remove.getparent().remove(to_remove)

for tag in tree.xpath("//*[@class]"):
tag.attrib.pop("class")
remove_tags = ["//style", "//img"]
remove_attributes = [
"//*[@class]",
# contains JSON-like data with a "ctm" key
"//*[@data-data]",
# contains coordinate-like data
"//*[@data-dest-detail]",
]
for xpath in remove_tags:
for element in tree.xpath(xpath):
element.getparent().remove(element)

for xpath in remove_attributes:
attrib = re.search(r"[\w-]+", xpath).group(0)
for element in tree.xpath(xpath):
element.attrib.pop(attrib)

return html.tostring(
tree, pretty_print=True, encoding="unicode"
Expand Down

0 comments on commit 8679a81

Please sign in to comment.