diff --git a/scrapers/oh/__init__.py b/scrapers/oh/__init__.py index 012b473cd0..4f4ed31ce3 100644 --- a/scrapers/oh/__init__.py +++ b/scrapers/oh/__init__.py @@ -40,6 +40,16 @@ class Ohio(State): "end_date": "2022-12-31", "active": False, }, + { + "_scraped_name": "135th - Special Session (2023)", + "classification": "special", + "identifier": "135S1", + "name": "135th Legislature, First Special Session", + "start_date": "2024-05-28", + "end_date": "2024-06-12", + "active": True, + "extras": {"session_id": "135_special_1", "session_url_slug": "135-s1"}, + }, { "_scraped_name": "135th (2023-2024)", "identifier": "135", diff --git a/scrapers/oh/bills.py b/scrapers/oh/bills.py index b248c00f7d..cb2098dbe7 100644 --- a/scrapers/oh/bills.py +++ b/scrapers/oh/bills.py @@ -4,11 +4,17 @@ import pytz import re import dateutil +import requests BAD_BILLS = [("134", "SB 92")] +requests.packages.urllib3.disable_warnings() + class OHBillScraper(Scraper): + short_base_url = "https://search-prod.lis.state.oh.us" + base_url = "" + session_url_slug = "" _tz = pytz.timezone("US/Eastern") # Vote Motion Dictionary was created by comparing vote codes to @@ -47,324 +53,317 @@ def scrape(self, session=None, chambers=None): "User-Agent" ] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36" - if int(session) < 128: - raise AssertionError(f"No data for period {session}") - - elif int(session) < 131: - # they changed their data format starting in 131st and added - # an undocumented API - yield from self.old_scrape(session) - - else: - chamber_dict = { - "Senate": "upper", - "House": "lower", - "House of Representatives": "lower", - "house": "lower", - "senate": "upper", - } - - # so presumably not everything passes, but we haven't - # seen anything not pass yet, so we'll need to wait - # till it fails and get the right language in here - vote_results = { - "approved": True, - "passed": True, - "adopted": True, - "true": True, - "false": False, - "failed": False, - True: True, - False: False, - } - - action_dict = { - "ref_ctte_100": "referral-committee", - "intro_100": "introduction", - "intro_101": "introduction", - "pass_300": "passage", - "intro_110": "reading-1", - "refer_210": "referral-committee", - "crpt_301": None, - "crpt_317": None, - "concur_606": "passage", - "pass_301": "passage", - "refer_220": "referral-committee", - "intro_102": ["introduction", "passage"], - "intro_105": ["introduction", "passage"], - "intro_ref_ctte_100": "referral-committee", - "refer_209": None, - "intro_108": ["introduction", "passage"], - "intro_103": ["introduction", "passage"], - "msg_reso_503": "passage", - "intro_107": ["introduction", "passage"], - "imm_consid_360": "passage", - "refer_213": None, - "adopt_reso_100": "passage", - "adopt_reso_110": "passage", - "msg_507": "amendment-passage", - "confer_713": None, - "concur_603": None, - "confer_712": None, - "msg_506": "amendment-failure", - "receive_message_100": "passage", - "motion_920": None, - "concur_611": None, - "confer_735": None, - "third_429": None, - "final_501": None, - "concur_608": None, - "infpass_217": "passage", - } - - base_url = "https://search-prod.lis.state.oh.us" - first_page = base_url - first_page += "/solarapi/v1/general_assembly_{session}/".format( - session=session + session_id = session + session_url_slug = session + for i in self.jurisdiction.legislative_sessions: + if i["identifier"] == session: + if "extras" in i and "session_id" in i["extras"]: + session_id = i["extras"]["session_id"] + session_url_slug = i["extras"]["session_url_slug"] + + self.base_url = f"https://search-prod.lis.state.oh.us/solarapi/v1/general_assembly_{session_id}/" + + chamber_dict = { + "Senate": "upper", + "House": "lower", + "House of Representatives": "lower", + "house": "lower", + "senate": "upper", + } + + # so presumably not everything passes, but we haven't + # seen anything not pass yet, so we'll need to wait + # till it fails and get the right language in here + vote_results = { + "approved": True, + "passed": True, + "adopted": True, + "true": True, + "false": False, + "failed": False, + True: True, + False: False, + } + + action_dict = { + "ref_ctte_100": "referral-committee", + "intro_100": "introduction", + "intro_101": "introduction", + "pass_300": "passage", + "intro_110": "reading-1", + "refer_210": "referral-committee", + "crpt_301": None, + "crpt_317": None, + "concur_606": "passage", + "pass_301": "passage", + "refer_220": "referral-committee", + "intro_102": ["introduction", "passage"], + "intro_105": ["introduction", "passage"], + "intro_ref_ctte_100": "referral-committee", + "refer_209": None, + "intro_108": ["introduction", "passage"], + "intro_103": ["introduction", "passage"], + "msg_reso_503": "passage", + "intro_107": ["introduction", "passage"], + "imm_consid_360": "passage", + "refer_213": None, + "adopt_reso_100": "passage", + "adopt_reso_110": "passage", + "msg_507": "amendment-passage", + "confer_713": None, + "concur_603": None, + "confer_712": None, + "msg_506": "amendment-failure", + "receive_message_100": "passage", + "motion_920": None, + "concur_611": None, + "confer_735": None, + "third_429": None, + "final_501": None, + "concur_608": None, + "infpass_217": "passage", + } + + first_page = self.base_url + legislators = self.get_legislator_ids(first_page) + all_amendments = self.get_other_data_source( + first_page, self.base_url, "amendments" + ) + all_fiscals = self.get_other_data_source(first_page, self.base_url, "fiscals") + all_synopsis = self.get_other_data_source( + first_page, self.base_url, "synopsiss" + ) + all_analysis = self.get_other_data_source( + first_page, self.base_url, "analysiss" + ) + + bills = self.get_total_bills(session) + for bill in bills: + bill_name = bill["name"] + bill_number = bill["number"] + + # S.R.No.1 -> SR1 + bill_id = bill_name.replace("No.", "").strip() + bill_id = bill_id.replace(".", "").replace(" ", "").strip() + # put one space back in between type and number + bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id) + + chamber = "lower" if "H" in bill_id else "upper" + classification = "bill" if "B" in bill_id else "resolution" + + title = bill["shorttitle"] if bill["shorttitle"] else "No title provided" + bill = Bill( + bill_id, + legislative_session=session, + chamber=chamber, + title=title, + classification=classification, ) - legislators = self.get_legislator_ids(first_page) - all_amendments = self.get_other_data_source( - first_page, base_url, "amendments" + bill.add_source( + f"https://www.legislature.ohio.gov/legislation/{session_url_slug}/{bill_number}" ) - all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals") - all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss") - all_analysis = self.get_other_data_source(first_page, base_url, "analysiss") - - bills = self.get_total_bills(session) - for bill in bills: - bill_name = bill["name"] - bill_number = bill["number"] - - # S.R.No.1 -> SR1 - bill_id = bill_name.replace("No.", "").strip() - bill_id = bill_id.replace(".", "").replace(" ", "").strip() - # put one space back in between type and number - bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id) - - chamber = "lower" if "H" in bill_id else "upper" - classification = "bill" if "B" in bill_id else "resolution" - - title = ( - bill["shorttitle"] if bill["shorttitle"] else "No title provided" - ) - bill = Bill( - bill_id, - legislative_session=session, - chamber=chamber, - title=title, - classification=classification, - ) - bill.add_source( - f"https://www.legislature.ohio.gov/legislation/{session}/{bill_number}" - ) - if (session, bill_id) in BAD_BILLS: - self.logger.warning( - f"Skipping details for known bad bill {bill_id}" - ) - yield bill - continue + if (session, bill_id) in BAD_BILLS: + self.logger.warning(f"Skipping details for known bad bill {bill_id}") + yield bill + continue - # get bill from API - bill_api_url = ( - "https://search-prod.lis.state.oh.us/solarapi/v1/" - "general_assembly_{}/{}/{}/".format( - session, - "bills" if "B" in bill_id else "resolutions", - bill_id.lower().replace(" ", ""), + # get bill from API + bill_api_url = "{}/{}/{}/".format( + self.base_url, + "bills" if "B" in bill_id else "resolutions", + bill_id.lower().replace(" ", ""), + ) + data = self.get(bill_api_url, verify=False).json() + if len(data["items"]) == 0: + self.logger.warning( + "Data for bill {bill_id} has empty 'items' array," + " cannot process related information".format( + bill_id=bill_id.lower().replace(" ", "") ) ) - data = self.get(bill_api_url, verify=False).json() - if len(data["items"]) == 0: - self.logger.warning( - "Data for bill {bill_id} has empty 'items' array," - " cannot process related information".format( - bill_id=bill_id.lower().replace(" ", "") - ) - ) - yield bill - continue - - # add title if no short title - if not bill.title: - bill.title = data["items"][0]["longtitle"] - bill.add_title(data["items"][0]["longtitle"], "long title") - - # this stuff is version-specific - for version in data["items"]: - version_name = version["version"] - version_link = base_url + version["pdfDownloadLink"] - bill.add_version_link( - version_name, version_link, media_type="application/pdf" - ) - - # we'll use the latest bill_version for everything else - bill_version = data["items"][0] - bill.add_source(bill_api_url) + yield bill + continue - # subjects - for subj in bill_version["subjectindexes"]: - try: - bill.add_subject(subj["primary"]) - except KeyError: - pass - try: - secondary_subj = subj["secondary"] - except KeyError: - secondary_subj = "" - if secondary_subj: - bill.add_subject(secondary_subj) - - # sponsors - sponsors = bill_version["sponsors"] - for sponsor in sponsors: - sponsor_name = self.get_sponsor_name(sponsor) - bill.add_sponsorship( - sponsor_name, - classification="primary", - entity_type="person", - primary=True, - ) + # add title if no short title + if not bill.title: + bill.title = data["items"][0]["longtitle"] + bill.add_title(data["items"][0]["longtitle"], "long title") + + # this stuff is version-specific + for version in data["items"]: + version_name = version["version"] + version_link = self.short_base_url + version["pdfDownloadLink"] + bill.add_version_link( + version_name, version_link, media_type="application/pdf" + ) - cosponsors = bill_version["cosponsors"] - for sponsor in cosponsors: - sponsor_name = self.get_sponsor_name(sponsor) - bill.add_sponsorship( - sponsor_name, - classification="cosponsor", - entity_type="person", - primary=False, - ) + # we'll use the latest bill_version for everything else + bill_version = data["items"][0] + bill.add_source(bill_api_url) + # subjects + for subj in bill_version["subjectindexes"]: try: - action_doc = self.get( - base_url + bill_version["action"][0]["link"], - verify=False, - ) - except scrapelib.HTTPError: + bill.add_subject(subj["primary"]) + except KeyError: pass - else: - actions = action_doc.json() - for action_row in reversed(actions["items"]): - actor = chamber_dict[action_row["chamber"]] - action_desc = action_row["description"] - try: - action_type = action_dict[action_row["actioncode"]] - except KeyError: - self.warning( - "Unknown action {desc} with code {code}." - " Add it to the action_dict" - ".".format( - desc=action_desc, code=action_row["actioncode"] - ) - ) - action_type = None - - date = dateutil.parser.parse(action_row["datetime"]) - if date.tzinfo is None: - date = self._tz.localize(date) - - date = "{:%Y-%m-%d}".format(date) - - action = bill.add_action( - action_desc, date, chamber=actor, classification=action_type - ) - committee = action_row.get("committee", "") - committee_id = action_row.get("cmte_lpid", "") - if committee_id: - committee = f'{action_row.get("chamber", "")} {committee} Committee'.strip() - action.add_related_entity( - committee, - entity_type="organization", - ) - - # attach documents gathered earlier - self.add_document(all_amendments, bill_id, "amendment", bill, base_url) - self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url) - self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url) - self.add_document(all_analysis, bill_id, "analysis", bill, base_url) - - # votes - vote_url = base_url + bill_version["votes"][0]["link"] try: - vote_doc = self.get(vote_url) - except scrapelib.HTTPError: - self.warning("Vote page not loading; skipping: {}".format(vote_url)) - yield bill - continue - votes = vote_doc.json() - yield from self.process_vote( - votes, - vote_url, - base_url, - bill, - legislators, - chamber_dict, - vote_results, + secondary_subj = subj["secondary"] + except KeyError: + secondary_subj = "" + if secondary_subj: + bill.add_subject(secondary_subj) + + # sponsors + sponsors = bill_version["sponsors"] + for sponsor in sponsors: + sponsor_name = self.get_sponsor_name(sponsor) + bill.add_sponsorship( + sponsor_name, + classification="primary", + entity_type="person", + primary=True, ) - vote_url = base_url - vote_url += bill_version["cmtevotes"][0]["link"] - try: - vote_doc = self.get(vote_url) - except scrapelib.HTTPError: - self.warning("Vote page not loading; skipping: {}".format(vote_url)) - yield bill - continue - votes = vote_doc.json() - yield from self.process_vote( - votes, - vote_url, - base_url, - bill, - legislators, - chamber_dict, - vote_results, + cosponsors = bill_version["cosponsors"] + for sponsor in cosponsors: + sponsor_name = self.get_sponsor_name(sponsor) + bill.add_sponsorship( + sponsor_name, + classification="cosponsor", + entity_type="person", + primary=False, ) - if data["items"][0]["effective_date"]: - effective_date = datetime.datetime.strptime( - data["items"][0]["effective_date"], "%Y-%m-%d" - ) - effective_date = self._tz.localize(effective_date) - # the OH website adds an action that isn't in the action list JSON. - # It looks like: - # Effective 7/6/18 - effective_date_oh = "{:%-m/%-d/%y}".format(effective_date) - effective_action = "Effective {}".format(effective_date_oh) - bill.add_action( - effective_action, - effective_date, - chamber="executive", - classification=["became-law"], - ) - - # we have never seen a veto or a disapprove, but they seem important. - # so we'll check and throw an error if we find one - # life is fragile. so are our scrapers. - if "veto" in bill_version: - veto_url = base_url + bill_version["veto"][0]["link"] - veto_json = self.get(veto_url).json() - if len(veto_json["items"]) > 0: - raise AssertionError( - "Whoa, a veto! We've never" - " gotten one before." - " Go write some code to deal" - " with it: {}".format(veto_url) + try: + action_doc = self.get( + self.short_base_url + bill_version["action"][0]["link"], + verify=False, + ) + except scrapelib.HTTPError: + pass + else: + actions = action_doc.json() + for action_row in reversed(actions["items"]): + actor = chamber_dict[action_row["chamber"]] + action_desc = action_row["description"] + try: + action_type = action_dict[action_row["actioncode"]] + except KeyError: + self.warning( + "Unknown action {desc} with code {code}." + " Add it to the action_dict" + ".".format(desc=action_desc, code=action_row["actioncode"]) ) + action_type = None - if "disapprove" in bill_version: - disapprove_url = base_url + bill_version["disapprove"][0]["link"] - disapprove_json = self.get(disapprove_url).json() - if len(disapprove_json["items"]) > 0: - raise AssertionError( - "Whoa, a disapprove! We've never" - " gotten one before." - " Go write some code to deal " - "with it: {}".format(disapprove_url) + date = dateutil.parser.parse(action_row["datetime"]) + if date.tzinfo is None: + date = self._tz.localize(date) + + date = "{:%Y-%m-%d}".format(date) + + action = bill.add_action( + action_desc, date, chamber=actor, classification=action_type + ) + committee = action_row.get("committee", "") + committee_id = action_row.get("cmte_lpid", "") + if committee_id: + committee = f'{action_row.get("chamber", "")} {committee} Committee'.strip() + action.add_related_entity( + committee, + entity_type="organization", ) + # attach documents gathered earlier + self.add_document(all_amendments, bill_id, "amendment", bill, self.base_url) + self.add_document(all_fiscals, bill_id, "fiscal", bill, self.base_url) + self.add_document(all_synopsis, bill_id, "synopsis", bill, self.base_url) + self.add_document(all_analysis, bill_id, "analysis", bill, self.base_url) + + # votes + vote_url = self.short_base_url + bill_version["votes"][0]["link"] + try: + vote_doc = self.get(vote_url) + except scrapelib.HTTPError: + self.warning("Vote page not loading; skipping: {}".format(vote_url)) yield bill + continue + votes = vote_doc.json() + yield from self.process_vote( + votes, + vote_url, + self.base_url, + bill, + legislators, + chamber_dict, + vote_results, + ) + + vote_url = self.short_base_url + bill_version["cmtevotes"][0]["link"] + try: + vote_doc = self.get(vote_url) + except scrapelib.HTTPError: + self.warning("Vote page not loading; skipping: {}".format(vote_url)) + yield bill + continue + votes = vote_doc.json() + yield from self.process_vote( + votes, + vote_url, + self.base_url, + bill, + legislators, + chamber_dict, + vote_results, + ) + + if data["items"][0]["effective_date"]: + effective_date = datetime.datetime.strptime( + data["items"][0]["effective_date"], "%Y-%m-%d" + ) + effective_date = self._tz.localize(effective_date) + # the OH website adds an action that isn't in the action list JSON. + # It looks like: + # Effective 7/6/18 + effective_date_oh = "{:%-m/%-d/%y}".format(effective_date) + effective_action = "Effective {}".format(effective_date_oh) + bill.add_action( + effective_action, + effective_date, + chamber="executive", + classification=["became-law"], + ) + + # we have never seen a veto or a disapprove, but they seem important. + # so we'll check and throw an error if we find one + # life is fragile. so are our scrapers. + if "veto" in bill_version: + veto_url = self.short_base_url + bill_version["veto"][0]["link"] + veto_json = self.get(veto_url).json() + if len(veto_json["items"]) > 0: + raise AssertionError( + "Whoa, a veto! We've never" + " gotten one before." + " Go write some code to deal" + " with it: {}".format(veto_url) + ) + + if "disapprove" in bill_version: + disapprove_url = ( + self.short_base_url + bill_version["disapprove"][0]["link"] + ) + disapprove_json = self.get(disapprove_url).json() + if len(disapprove_json["items"]) > 0: + raise AssertionError( + "Whoa, a disapprove! We've never" + " gotten one before." + " Go write some code to deal " + "with it: {}".format(disapprove_url) + ) + + yield bill def pages(self, base_url, first_page): page = self.get(first_page) @@ -379,7 +378,7 @@ def get_total_bills(self, session): # The /resolutions endpoint has included duplicate bills in its output, so use a set to filter duplicates bill_numbers_seen = set() total_bills = [] - bills_url = f"https://search-prod.lis.state.oh.us/solarapi/v1/general_assembly_{session}/bills" + bills_url = f"{self.base_url}bills" bill_data = self.get(bills_url, verify=False).json() if len(bill_data["items"]) == 0: self.logger.warning("No bills") @@ -392,7 +391,7 @@ def get_total_bills(self, session): f"Duplicate bill found in bills API response: {bill['number']}" ) - res_url = f"https://search-prod.lis.state.oh.us/solarapi/v1/general_assembly_{session}/resolutions" + res_url = f"{self.base_url}resolutions" res_data = self.get(res_url, verify=False).json() if len(res_data["items"]) == 0: self.logger.warning("No resolutions")