From 9864e84dd9574e7cf41ac1679c33a3e2126d3caf Mon Sep 17 00:00:00 2001 From: showerst Date: Mon, 4 Dec 2023 16:50:56 -0500 Subject: [PATCH] OH: Events: Add agendas and bills (#4516) --- scrapers/oh/events.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scrapers/oh/events.py b/scrapers/oh/events.py index 6d63444d56..d4331b7684 100644 --- a/scrapers/oh/events.py +++ b/scrapers/oh/events.py @@ -51,12 +51,20 @@ class OHEventScraper(Scraper): _tz = pytz.timezone("US/Eastern") base_url = "https://www.legislature.ohio.gov/schedules/" + api_base_url = "https://search-prod.lis.state.oh.us/" + session_id = "" scraper = cloudscraper.create_scraper() + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36" + } dedupe_keys = set() def scrape(self, start=None, end=None): + # pull the newest session id from __init__.py + self.session_id = self.jurisdiction.legislative_sessions[-1]["identifier"] + if start is None: start = dt.datetime.today() else: @@ -72,6 +80,7 @@ def scrape(self, start=None, end=None): url = f"{self.base_url}calendar-data?start={start}&end={end}" try: + self.info(f"Fetching {url}") data = json.loads(self.scraper.get(url).content) except Exception: raise EmptyScrape @@ -137,6 +146,9 @@ def scrape(self, start=None, end=None): else: self.dedupe_keys.add(event_key) + if re.match(r"^.*\shearing room", location, flags=re.IGNORECASE): + location = f"{location}, 1 Capitol Square, Columbus, OH 43215" + event = Event( name=name, start_date=when, location_name=location, status=status ) @@ -155,6 +167,25 @@ def scrape(self, start=None, end=None): event.add_source(url) + # API has more data on agenda and bills, ex: + # https://search-prod.lis.state.oh.us/solarapi/v1/general_assembly_135/notices/cmte_s_health_1/2023-03-01 + + com_id = re.search(r"\/([a-z_]+\d)", item["url"], flags=re.IGNORECASE) + if com_id: + com_id = com_id.group(1) + hearing_date = when.strftime("%Y-%m-%d") + api_url = f"{self.api_base_url}/solarapi/v1/general_assembly_{self.session_id}/notices/{com_id}/{hearing_date}?format=json" + self.info(f"Fetching {api_url}") + api_data = json.loads(self.scraper.get(api_url).content) + for row in api_data["agenda"]: + item_text = f"{row['headline']} - {row['proposed_sponsor']}" + agenda_item = event.add_agenda_item(item_text) + if "billno" in row: + agenda_item.add_bill(row["billno"]) + + # Note: there's an api element called 'testimonies' that appears to just be + # witness registration forms from the bill sponsors, so we're skipping those. + event_count += 1 yield event