Skip to content

Commit

Permalink
Merge pull request #1247 from freelawproject/fix_colo
Browse files Browse the repository at this point in the history
fix(colo, coloctapp): update scraper for api changes
  • Loading branch information
flooie authored Nov 21, 2024
2 parents d6dd494 + d82a399 commit 76186d0
Showing 1 changed file with 23 additions and 12 deletions.
35 changes: 23 additions & 12 deletions juriscraper/opinions/united_states/state/colo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
- 2024-07-04: Update to new site, grossir
"""

from datetime import date, datetime
from typing import Tuple
from datetime import date, datetime, timedelta
from typing import Optional, Tuple
from urllib.parse import urlencode

from juriscraper.AbstractSite import logger
Expand All @@ -31,7 +31,7 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.params = {
"product_id": "WW",
"product_id": "COLORADO",
"jurisdiction": "US",
"content_type": "2",
"court": self.api_court_code,
Expand All @@ -40,14 +40,13 @@ def __init__(self, *args, **kwargs):
"per_page": "30", # Server breaks down when per_page=500, returns 503
"page": "1",
"sort": "date",
"type": "document",
"include_local_exclusive": "true",
"cbm": "6.0|361.0|5.0|9.0|4.0|2.0=0.01|400.0|1.0|0.001|1.5|0.2",
"locale": "en",
"hide_ct6": "true",
"t": str(datetime.now().timestamp())[:10],
"type": "document",
}
self.url = f"{self.base_url}?{urlencode(self.params)}"
self.update_url()

# Request won't work without some of these X- headers
self.request["headers"].update(
Expand Down Expand Up @@ -123,19 +122,31 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
:return None
"""
logger.info("Backscraping for range %s %s", *dates)
self.update_url(dates)
self.html = self._download()
self._process_html()

def update_url(self, dates: Optional[Tuple[date]] = None) -> None:
    """Set self.url with a date filter and a fresh timestamp, then scrape.

    A request with no date filter was returning very old documents
    instead of the most recent ones, so a date range is always applied.

    :param dates: (start, end) date tuple. When omitted, defaults to the
        last week (today - 7 days .. tomorrow) so regular scrapes pick up
        the newest documents.
    :return: None. Side effects: sets self.url and self.html, and calls
        self._process_html().
    """
    if not dates:
        today = datetime.now()
        # end is tomorrow to be safe against server-side timezone skew
        dates = (today - timedelta(7), today + timedelta(1))

    start = dates[0].strftime("%Y-%m-%d")
    end = dates[1].strftime("%Y-%m-%d")
    # The API expects a 10-digit (seconds-resolution) Unix timestamp
    timestamp = str(datetime.now().timestamp())[:10]

    # Copy base params so repeated calls don't accumulate stale overrides.
    # NOTE: only scalar values here — urlencode without doseq=True would
    # serialize list values as literal "['en', 'en']" strings and corrupt
    # the query string.
    params = {**self.params}
    params.update(
        {
            "date": f"{start}..{end}",
            "t": timestamp,
        }
    )
    self.url = f"{self.base_url}?{urlencode(params)}"
    self.html = self._download()
    self._process_html()

0 comments on commit 76186d0

Please sign in to comment.