Merge pull request #1201 from freelawproject/fix_or_index_error

fix(or): update scraper to solve IndexError
freelawproject · Oct 10, 2024 · e2908ad · e2908ad
2 parents 865a726 + ab19c60
commit e2908ad
Show file tree

Hide file tree

Showing 10 changed files with 6,308 additions and 6,965 deletions.
diff --git a/juriscraper/opinions/united_states/state/or.py b/juriscraper/opinions/united_states/state/or.py
@@ -4,15 +4,124 @@
  - 2023-11-18: Fixed and updated
 """
 
-from juriscraper.opinions.united_states.state import orctapp
+from datetime import datetime, timedelta
 
+from juriscraper.AbstractSite import logger
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
+
+
+class Site(OpinionSiteLinear):
+    court_code = "p17027coll3"
+    base_url = "https://cdm17027.contentdm.oclc.org/digital/api/search/collection/{}/searchterm/{}-{}/field/dated/mode/exact/conn/and/maxRecords/200"
+    # technically they have an 1870 case but just one
+    first_opinion_date = datetime(1997, 8, 12)
+    days_interval = 15
 
-class Site(orctapp.Site):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
-        self.url = (
-            "https://www.courts.oregon.gov/publications/sc/Pages/default.aspx"
-        )
-        self.status = "Published"
-        self.court_code = "p17027coll3"
+        today = datetime.today()
+        self.url = self.format_url(today - timedelta(15), today)
+        self.make_backscrape_iterable(kwargs)
+
+    def _process_html(self):
+        """Turn each search result row into a case dict; skip nameless rows."""
+        for row in self.html["items"]:
+            fields = row["metadataFields"]
+            docket, name, citation, date = (x["value"] for x in fields)
+            if not name:
+                # Happens on rows like:
+                # "Miscellaneous Supreme Court dispositions, June 10 and 13, 2024"
+                logger.info("Skipping row '%s'", docket)
+                continue
+
+            details = self.get_details(row)
+            judge, disposition, status, lower_court_number = details
+            per_curiam = False
+            # Parenthesized: `and` binds tighter than `or`, guard `.lower()` too
+            if judge and (judge == "PC" or "per curiam" in judge.lower()):
+                per_curiam = True
+                judge = ""
+
+            self.cases.append(
+                {
+                    "name": name,
+                    "date": date,
+                    "docket": docket.split(",")[0],
+                    "url": f"https://ojd.contentdm.oclc.org/digital/api/collection/{row['collectionAlias']}/id/{row['itemId']}/download",
+                    "citation": citation,
+                    "judge": judge,
+                    "per_curiam": per_curiam,
+                    "status": status,
+                    "disposition": disposition,
+                    "lower_court_number": lower_court_number,
+                }
+            )
+
+    def get_details(self, row: dict) -> tuple[str, str, str, str]:
+        """Makes a secondary request to get details for a single
+        opinion
+
+        :param row: the JSON records, to get the item id for the request
+            or the JSON object in tests
+        :return: a tuple containing, if it has a valid value
+            - judge
+            - disposition
+            - status
+            - lower court number (only for `or`)
+        """
+        if self.test_mode_enabled():
+            if not row.get("detailJson"):
+                return (
+                    "placeholder judge",
+                    "placeholder disposition",
+                    "Unknown",
+                    "placeholder lower court number",
+                )
+            # Some test cases have their detail data manually copy pasted
+            detail = row["detailJson"]
+        else:
+            item_id = row["itemId"]
+            url = f"https://cdm17027.contentdm.oclc.org/digital/api/collections/{self.court_code}/items/{item_id}/false"
+            # Log before requesting so the URL is recorded even on failure
+            logger.debug("Getting detail JSON from %s", url)
+            detail = self.request["session"].get(url).json()
+
+        # When the item itself has a single field, read the parent's fields
+        fields = detail["fields"]
+        if len(fields) == 1:
+            fields = detail["parent"]["fields"]
+
+        judge, disposition, status, lower_court_number = "", "", "Unknown", ""
+        for field in fields:
+            if field["key"] == "judge":
+                judge = field["value"]
+            elif field["key"] == "type":
+                if field["value"] == "Nonprecedential opinion":
+                    status = "Unpublished"
+                else:
+                    status = "Published"
+            elif field["key"] == "descri":
+                disposition = field["value"]
+            elif field["key"] == "relhapt":
+                # For orctapp this field may be populated with consolidated docket
+                # numbers
+                value = field["value"]
+                if self.court_id.endswith("or") and not value.startswith("S"):
+                    lower_court_number = value
+
+        return judge, disposition, status, lower_court_number
+
+    def _download_backwards(self, dates: tuple) -> None:
+        logger.info("Backscraping for range %s %s", *dates)
+        self.url = self.format_url(*dates)  # dates -> (start_date, end_date)
+        self.html = self._download()  # presumably fetches self.url - confirm
+        self._process_html()  # appends the parsed rows to self.cases
+
+    def format_url(self, start_date: datetime, end_date: datetime) -> str:
+        """Build the search URL covering `start_date` through `end_date`.
+
+        Both dates are rendered as YYYYMMDD and filled into `base_url`.
+        """
+        first, last = (d.strftime("%Y%m%d") for d in (start_date, end_date))
+        return self.base_url.format(self.court_code, first, last)
diff --git a/juriscraper/opinions/united_states/state/orctapp.py b/juriscraper/opinions/united_states/state/orctapp.py
@@ -7,62 +7,14 @@
     - 2023-11-18: Created
 """
 
-from juriscraper.DeferringList import DeferringList
-from juriscraper.OpinionSiteLinear import OpinionSiteLinear
+from importlib import import_module
 
+# `or` is a python reserved keyword; can't import the module as usual
+oregon_module = import_module("juriscraper.opinions.united_states.state.or")
 
-class Site(OpinionSiteLinear):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.court_id = self.__module__
-        self.url = (
-            "https://www.courts.oregon.gov/publications/coa/Pages/default.aspx"
-        )
-        self.cases = []
-        self.status = "Published"
-        self.court_code = "p17027coll5"
-
-    def fetch_url_json(self, identifier):
-        """"""
-        url = f"https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{self.court_code}/identi^{identifier}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
-        json = self.request["session"].get(url).json()
-        return f"https://ojd.contentdm.oclc.org/digital/api/collection/{self.court_code}/id/{json['records'][0]['pointer']}/download"
-
-    def _process_html(self):
-        for header in self.html.xpath("//h4//a/parent::h4"):
-            date_string = header.text_content().strip()
-            if not date_string:
-                continue
-            ul = header.xpath("./following-sibling::ul")[0]
-            for item in ul.xpath(".//li"):
-                # Ensure two links are present (skip Petitions for Review rows)
-                # see or_example_2.html
-                anchors = item.xpath(".//a")
-                if not (len(anchors) > 1):
-                    continue
-                text = item.text_content().strip()
-                url = anchors[0].xpath("./@href")[0]
-                docket = anchors[1].text_content().strip()
-                name = text.split(")", 1)[-1]
-                self.cases.append(
-                    {
-                        "date": date_string,
-                        "name": name,
-                        "docket": docket,
-                        "url": url,
-                    }
-                )
-
-    def _get_download_urls(self):
-        """Get download urls
 
-        :return: List URLs
-        """
+class Site(oregon_module.Site):
+    court_code = "p17027coll5"
 
-        def fetcher(case):
-            if self.test_mode_enabled():
-                return case["url"]
-
-            return self.fetch_url_json(case["url"].split("=")[-1][:-4])
-
-        return DeferringList(seed=self.cases, fetcher=fetcher)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)