Skip to content

Commit

Permalink
Merge pull request #1201 from freelawproject/fix_or_index_error
Browse files Browse the repository at this point in the history
fix(or): update scraper to solve IndexError
  • Loading branch information
flooie authored Oct 10, 2024
2 parents 865a726 + ab19c60 commit e2908ad
Show file tree
Hide file tree
Showing 10 changed files with 6,308 additions and 6,965 deletions.
123 changes: 116 additions & 7 deletions juriscraper/opinions/united_states/state/or.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,124 @@
- 2023-11-18: Fixed and updated
"""

from juriscraper.opinions.united_states.state import orctapp
from datetime import datetime, timedelta

from juriscraper.AbstractSite import logger
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
court_code = "p17027coll3"
base_url = "https://cdm17027.contentdm.oclc.org/digital/api/search/collection/{}/searchterm/{}-{}/field/dated/mode/exact/conn/and/maxRecords/200"
# technically they have an 1870 case but just one
first_opinion_date = datetime(1997, 8, 12)
days_interval = 15

class Site(orctapp.Site):
def __init__(self, *args, **kwargs):
    """Configure the scraper to query the most recent 15 days of opinions.

    Sets the court id from the module name, marks opinions as
    "Published" by default, builds the date-range search URL ending today
    and registers the backscrape iterable.

    :param args: positional arguments forwarded to the parent class
    :param kwargs: keyword arguments forwarded to the parent class and to
        `make_backscrape_iterable`
    """
    super().__init__(*args, **kwargs)
    self.court_id = self.__module__
    self.status = "Published"
    # NOTE: `court_code` is intentionally not assigned on the instance.
    # `format_url` reads `self.court_code`, and a subclass overrides it as a
    # class attribute; an instance assignment here would shadow that
    # override as soon as the subclass calls `super().__init__()`.
    today = datetime.today()
    self.url = self.format_url(today - timedelta(15), today)
    self.make_backscrape_iterable(kwargs)

def _process_html(self):
    """Build `self.cases` from the search JSON stored in `self.html`.

    Every entry of `self.html["items"]` must carry exactly four
    `metadataFields` values, which are read in order as docket, name,
    citation and date. Rows without a name are skipped. For the remaining
    rows `self.get_details` supplies judge, disposition, status and lower
    court number.
    """
    for row in self.html["items"]:
        docket, name, citation, date = (
            field["value"] for field in row["metadataFields"]
        )
        if not name:
            # Happens on rows like:
            # "Miscellaneous Supreme Court dispositions, June 10 and 13, 2024"
            logger.info("Skipping row '%s'", docket)
            continue

        judge, disposition, status, lower_court_number = self.get_details(
            row
        )

        # The parentheses make the emptiness check guard BOTH comparisons.
        # Without them `and` binds tighter than `or`, so a falsy judge
        # (e.g. None) would still reach `judge.lower()` and raise.
        per_curiam = bool(judge) and (
            judge == "PC" or "per curiam" in judge.lower()
        )
        if per_curiam:
            # A per curiam opinion has no individual author
            judge = ""

        self.cases.append(
            {
                "name": name,
                "date": date,
                # Only the first docket number of a comma separated list
                "docket": docket.split(",")[0],
                "url": f"https://ojd.contentdm.oclc.org/digital/api/collection/{row['collectionAlias']}/id/{row['itemId']}/download",
                "citation": citation,
                "judge": judge,
                "per_curiam": per_curiam,
                "status": status,
                "disposition": disposition,
                "lower_court_number": lower_court_number,
            }
        )

def get_details(self, row: dict) -> tuple[str, str, str, str]:
    """Makes a secondary request to get details for a single
    opinion

    In test mode no request is made: the detail data is read from
    `row["detailJson"]`, and placeholder values are returned when that key
    is missing or empty.

    :param row: the JSON records, to get the item id for the request
        or the JSON object in tests
    :return: a tuple containing, if it has a valid value
        - judge
        - disposition
        - status ("Unknown" when the detail has no `type` field)
        - lower court number (only for `or`)
    """
    if self.test_mode_enabled():
        if not row.get("detailJson"):
            return (
                "placeholder judge",
                "placeholder disposition",
                "Unknown",
                "placeholder lower court number",
            )
        # Some test cases have their detail data manually copy pasted
        detail = row["detailJson"]
    else:
        item_id = row["itemId"]
        url = f"https://cdm17027.contentdm.oclc.org/digital/api/collections/{self.court_code}/items/{item_id}/false"
        # Log before requesting so the URL is recorded even when the
        # request or the JSON decoding fails
        logger.debug("Getting detail JSON from %s", url)
        detail = self.request["session"].get(url).json()

    # When the record itself carries a single field, the metadata is read
    # from its parent record instead
    if len(detail["fields"]) == 1:
        fields = detail["parent"]["fields"]
    else:
        fields = detail["fields"]

    judge, disposition, status, lower_court_number = "", "", "Unknown", ""
    for field in fields:
        key = field["key"]
        if key == "judge":
            judge = field["value"]
        elif key == "type":
            if field["value"] == "Nonprecedential opinion":
                status = "Unpublished"
            else:
                status = "Published"
        elif key == "descri":
            disposition = field["value"]
        elif key == "relhapt":
            # For orctapp this field may be populated with consolidated docket
            # numbers
            if self.court_id.endswith("or"):
                value = field["value"]
                # Guard against an empty/null value before `startswith`
                if value and not value.startswith("S"):
                    lower_court_number = value

    return judge, disposition, status, lower_court_number

def _download_backwards(self, dates: tuple) -> None:
    """Scrape one backscrape window.

    Logs the window, points `self.url` at the search URL for it, downloads
    the result into `self.html` and parses it with `_process_html`.

    :param dates: the window bounds, unpacked positionally into
        `format_url` (start date, end date)
    :return: None
    """
    logger.info("Backscraping for range %s %s", *dates)
    window_url = self.format_url(*dates)
    self.url = window_url
    self.html = self._download()
    self._process_html()

def format_url(self, start_date: datetime, end_date: datetime) -> str:
    """
    Builds the search URL for a date range

    Both dates are rendered as YYYYMMDD and substituted, together with
    `self.court_code`, into `self.base_url`.

    :param start_date: first day of the range
    :param end_date: last day of the range
    :return: the formatted URL
    """
    day_format = "%Y%m%d"
    return self.base_url.format(
        self.court_code,
        start_date.strftime(day_format),
        end_date.strftime(day_format),
    )
62 changes: 7 additions & 55 deletions juriscraper/opinions/united_states/state/orctapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,62 +7,14 @@
- 2023-11-18: Created
"""

from juriscraper.DeferringList import DeferringList
from juriscraper.OpinionSiteLinear import OpinionSiteLinear
from importlib import import_module

# `or` is a python reserved keyword; can't import the module as usual
oregon_module = import_module("juriscraper.opinions.united_states.state.or")

class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
    """Initialise the scraper with its fixed settings.

    Sets the court id from the module name, an empty case list, the
    default "Published" status, the collection code and the `coa`
    publications page URL.
    """
    super().__init__(*args, **kwargs)
    self.court_id = self.__module__
    self.cases = []
    self.status = "Published"
    self.court_code = "p17027coll5"
    self.url = "https://www.courts.oregon.gov/publications/coa/Pages/default.aspx"

def fetch_url_json(self, identifier):
    """Resolve a document identifier to its download URL.

    Queries the collection `self.court_code` for `identifier` and builds
    the download URL from the `pointer` of the first returned record.

    :param identifier: the identifier to search for
    :return: the download URL of the first matching record
    """
    query_url = f"https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{self.court_code}/identi^{identifier}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
    response = self.request["session"].get(query_url)
    # Only the first record of the result set is used
    pointer = response.json()["records"][0]["pointer"]
    return f"https://ojd.contentdm.oclc.org/digital/api/collection/{self.court_code}/id/{pointer}/download"

def _process_html(self):
    """Collect cases from the dated `h4` headers of the page.

    For every `h4` that contains a link and has non-blank text, the text
    is used as the date of each `li` in the first following sibling `ul`.
    List items with fewer than two links are ignored.
    """
    for heading in self.html.xpath("//h4//a/parent::h4"):
        date_string = heading.text_content().strip()
        if date_string:
            listing = heading.xpath("./following-sibling::ul")[0]
            for entry in listing.xpath(".//li"):
                # Ensure two links are present (skip Petitions for Review rows)
                # see or_example_2.html
                links = entry.xpath(".//a")
                if len(links) <= 1:
                    continue
                entry_text = entry.text_content().strip()
                href = links[0].xpath("./@href")[0]
                docket_number = links[1].text_content().strip()
                # The name is whatever follows the first ")"; the whole
                # text when there is none
                _, paren, remainder = entry_text.partition(")")
                case_name = remainder if paren else entry_text
                self.cases.append(
                    {
                        "date": date_string,
                        "name": case_name,
                        "docket": docket_number,
                        "url": href,
                    }
                )

def _get_download_urls(self):
"""Get download urls

:return: List URLs
"""
class Site(oregon_module.Site):
court_code = "p17027coll5"

def fetcher(case):
if self.test_mode_enabled():
return case["url"]

return self.fetch_url_json(case["url"].split("=")[-1][:-4])

return DeferringList(seed=self.cases, fetcher=fetcher)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Loading

0 comments on commit e2908ad

Please sign in to comment.