Merge branch 'main' into improve-okla-cleanup

freelawproject · Oct 21, 2024 · 8679a81 · 8679a81
2 parents 35a222f + 1d47cc6
commit 8679a81
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 11 deletions.
diff --git a/juriscraper/opinions/united_states/state/colo.py b/juriscraper/opinions/united_states/state/colo.py
@@ -83,19 +83,23 @@ def _process_html(self) -> None:
                 self._request_url_get(url)
                 detail_json = self.request["response"].json()
 
+            # Reset variables so that values from a previous item do not
+            # carry over when a property is missing
+            docket_number, case_name_full, date_filed = "", "", ""
+
             # Example of parallel citation:
             # https://research.coloradojudicial.gov/vid/907372624
             citation, parallel_citation = "", ""
             for p in detail_json["properties"]:
                 label = p["property"]["label"]
                 if label == "Docket Number":
                     docket_number = p["values"][0]
-                if label == "Parties":
+                elif label == "Parties":
                     case_name_full = p["values"][0]
-                if label == "Decision Date":
+                elif label == "Decision Date":
                     # Note that json['published_at'] is not the date_filed
                     date_filed = p["values"][0]
-                if label == "Citation":
+                elif label == "Citation":
                     citation = p["values"][0]
                     if len(p["values"]) > 1:
                         parallel_citation = p["values"][1]

diff --git a/juriscraper/opinions/united_states/state/coloctapp.py b/juriscraper/opinions/united_states/state/coloctapp.py
@@ -9,6 +9,8 @@
     - 2023-11-19: Updated by William E. Palin
 """
 
+import re
+
 from lxml import html
 
 from juriscraper.opinions.united_states.state import colo
@@ -35,14 +37,22 @@ def cleanup_content(content: str) -> str:
         :return: cleaned up html
         """
         tree = html.fromstring(content)
-        remove_xpaths = ["//style", "//img"]
-        for xpath in remove_xpaths:
-            if tree.xpath(xpath):
-                to_remove = tree.xpath(xpath)[0]
-                to_remove.getparent().remove(to_remove)
-
-        for tag in tree.xpath("//*[@class]"):
-            tag.attrib.pop("class")
+        remove_tags = ["//style", "//img"]
+        remove_attributes = [
+            "//*[@class]",
+            # contains JSON-like data with a "ctm" key
+            "//*[@data-data]",
+            # contains coordinate-like data
+            "//*[@data-dest-detail]",
+        ]
+        for xpath in remove_tags:
+            for element in tree.xpath(xpath):
+                element.getparent().remove(element)
+
+        for xpath in remove_attributes:
+            attrib = re.search(r"[\w-]+", xpath).group(0)
+            for element in tree.xpath(xpath):
+                element.attrib.pop(attrib)
 
         return html.tostring(
             tree, pretty_print=True, encoding="unicode"