From 087c7683f6e8427236f1813cd5306af4bc7fb90a Mon Sep 17 00:00:00 2001 From: braykuka Date: Wed, 23 Oct 2024 17:26:47 +0200 Subject: [PATCH 01/14] IN: Vote pdf scraping issue --- scrapers/in/bills.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py index 96f3608149..d5853060a8 100644 --- a/scrapers/in/bills.py +++ b/scrapers/in/bills.py @@ -15,7 +15,7 @@ settings = dict(SCRAPELIB_TIMEOUT=600) -PROXY_BASE_URL = "https://in-proxy.openstates.org/" +PROXY_BASE_URL = "https://in-proxy.openstates.org" SCRAPE_WEB_VERSIONS = "INDIANA_SCRAPE_WEB_VERSIONS" in os.environ @@ -99,7 +99,6 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session): ) date_parts = lines[1].strip().split()[-3:] date_str = " ".join(date_parts).title() + " " + lines[2].strip() - vote_date = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p") vote_date = pytz.timezone("America/Indiana/Indianapolis").localize( vote_date @@ -116,20 +115,26 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session): if res in line.upper(): passed = val break - if passed is None: raise AssertionError("Missing bill passage type") - motion = " ".join(lines[4].split()[:-2]) + motion = " ".join(lines[4].split()[:-2]).strip() + + for line_num in range(4, 8): + if "Yea " in lines[line_num]: + break + if line_num > 4: + motion = " ".join(lines[3].split()).strip() + if "Roll Call" in motion: + motion = motion.split(":")[-1].strip() try: - yeas = int(lines[4].split()[-1]) - nays = int(lines[5].split()[-1]) - excused = int(lines[6].split()[-1]) - not_voting = int(lines[7].split()[-1]) + yeas = int(lines[line_num].split()[-1]) + nays = int(lines[line_num + 1].split()[-1]) + excused = int(lines[line_num + 2].split()[-1]) + not_voting = int(lines[line_num + 3].split()[-1]) except ValueError: self.logger.warning("Vote format is weird, skipping") continue - vote = VoteEvent( chamber=chamber, legislative_session=session, @@ -390,7 +395,6 @@ def scrape(self, session=None): for b in all_pages: bill_id = b["billName"] disp_bill_id = b["displayName"] - bill_link = b["link"] api_source = api_base_url + bill_link try: From 243a30948ae526f24383c8c12f81b238c5033bee Mon Sep 17 00:00:00 2001 From: braykuka Date: Wed, 23 Oct 2024 18:09:56 +0200 Subject: [PATCH 02/14] fix: format issue --- scrapers/in/bills.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py index d5853060a8..afafe217ed 100644 --- a/scrapers/in/bills.py +++ b/scrapers/in/bills.py @@ -91,6 +91,7 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session): continue text = convert_pdf(path, "text").decode("utf-8") + lines = text.split("\n") os.remove(path) @@ -118,23 +119,26 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session): if passed is None: raise AssertionError("Missing bill passage type") - motion = " ".join(lines[4].split()[:-2]).strip() - for line_num in range(4, 8): if "Yea " in lines[line_num]: break - if line_num > 4: - motion = " ".join(lines[3].split()).strip() - if "Roll Call" in motion: - motion = motion.split(":")[-1].strip() - try: - yeas = int(lines[line_num].split()[-1]) - nays = int(lines[line_num + 1].split()[-1]) - excused = int(lines[line_num + 2].split()[-1]) - not_voting = int(lines[line_num + 3].split()[-1]) - except ValueError: + motion = " ".join(lines[line_num].split()[:-2]).strip() + + yeas, nays, excused, not_voting = [""] * 
4 + for line in lines[4:10]: + if "Yea " in line: + yeas = int(line.split()[-1]) + elif "Nay" in line: + nays = int(line.split()[-1]) + elif "Excused " in line: + excused = int(line.split()[-1]) + elif "Not Voting " in line: + not_voting = int(line.split()[-1]) + + if any(val == "" for val in [yeas, nays, excused, not_voting]): self.logger.warning("Vote format is weird, skipping") continue + vote = VoteEvent( chamber=chamber, legislative_session=session, From 8631c400b8079d19b5543fe8af3ef1cd9a5a28a5 Mon Sep 17 00:00:00 2001 From: braykuka Date: Thu, 24 Oct 2024 19:51:41 +0200 Subject: [PATCH 03/14] IN: update bills scraper with new api --- scrapers/in/__init__.py | 6 +- scrapers/in/apiclient.py | 29 +++++---- scrapers/in/bills.py | 128 +++++++-------------------------------- 3 files changed, 40 insertions(+), 123 deletions(-) diff --git a/scrapers/in/__init__.py b/scrapers/in/__init__.py index 753e4694dc..302de2d1bf 100644 --- a/scrapers/in/__init__.py +++ b/scrapers/in/__init__.py @@ -175,10 +175,10 @@ def get_session_list(self): apikey = os.environ["INDIANA_API_KEY"] useragent = os.getenv("USER_AGENT", "openstates") headers = { - "Authorization": apikey, + "x-api-key": apikey, "Accept": "application/json", "User-Agent": useragent, } - resp = requests.get("https://api.iga.in.gov/sessions", headers=headers) + resp = requests.get("https://beta-api.iga.in.gov", headers=headers) resp.raise_for_status() - return [session["name"] for session in resp.json()["items"]] + return [session["name"] for session in resp.json()["sessions"]] diff --git a/scrapers/in/apiclient.py b/scrapers/in/apiclient.py index 09a9d5e046..7a7e936d74 100644 --- a/scrapers/in/apiclient.py +++ b/scrapers/in/apiclient.py @@ -5,7 +5,7 @@ """ API key must be passed as a header. You need the following headers to get JSON: -Authorization = your_apikey +x-api-key = your_apikey Accept = "application/json" If you're trying to hit api links through your browser you @@ -48,26 +48,29 @@ def wrapped(self, *args, **kwargs): class ApiClient(object): """ - docs: http://docs.api.iga.in.gov/ + docs: https://docs.beta-api.iga.in.gov """ - root = "https://api.iga.in.gov/" + root = "https://beta-api.iga.in.gov" resources = dict( - sessions="/sessions", + sessions="/", subjects="/{session}/subjects", chambers="/{session}/chambers", bills="/{session}/bills", - bill="/{session}/bills/{bill_id}", + bill="{bill_link}", chamber_bills="/{session}/chambers/{chamber}/bills", - # note that rollcall_id has to be pulled off the URL, it's NOT the rollcall_number - rollcalls="/{session}/rollcalls/{rollcall_id}", - bill_actions="/{session}/bills/{bill_id}/actions", + rollcalls="/{session}/rollcalls", + rollcall="{rollcall_link}", + meetings="/{session}/meetings", + meeting="{meeting_link}", + bill_actions="{action_link}", committees="/{session}/committees", - committee="/{committee_link}", + committee="{committee_link}", legislators="/{session}/legislators", - legislator="/{session}/legislators/{legislator_id}", + legislator="{legislator_link}", chamber_legislators="/{session}/chambers/{chamber}/legislators", bill_version="/{session}/bills/{bill_id}/versions/{version_id}", + fiscal_notes="/{session}/fiscal-notes", ) def __init__(self, scraper): @@ -78,7 +81,7 @@ def __init__(self, scraper): @check_response def geturl(self, url): headers = {} - headers["Authorization"] = self.apikey + headers["x-api-key"] = self.apikey headers["Accept"] = "application/json" headers["User-Agent"] = self.user_agent self.scraper.info("Api GET next page: %r, %r" % (url, headers)) 
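
Note on the hunk above: the beta API authenticates with an "x-api-key" header where the old api.iga.in.gov host took "Authorization". A minimal sketch of a direct request, assuming INDIANA_API_KEY is set in the environment as these scrapers expect; the /{session}/bills path and the "items" key are taken from the patched resources table and unpaginate usage, not from independently verified API docs:

    import os
    import requests

    headers = {
        "x-api-key": os.environ["INDIANA_API_KEY"],  # replaces the old "Authorization" header
        "Accept": "application/json",  # without this the API may answer with HTML or PDF
        "User-Agent": os.getenv("USER_AGENT", "openstates"),
    }
    resp = requests.get("https://beta-api.iga.in.gov/2024/bills", headers=headers)
    resp.raise_for_status()
    for b in resp.json().get("items", []):  # "items" key assumed from unpaginate()
        print(b["billName"], b["link"])
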
@@ -87,7 +90,7 @@ def geturl(self, url): @check_response def get_relurl(self, url): headers = {} - headers["Authorization"] = self.apikey + headers["x-api-key"] = self.apikey headers["Accept"] = "application/json" headers["User-Agent"] = self.user_agent url = urljoin(self.root, url) @@ -113,7 +116,7 @@ def get( requests_args = requests_args or () requests_kwargs = requests_kwargs or {} headers = requests_kwargs.get("headers", {}) - headers["Authorization"] = self.apikey + headers["x-api-key"] = self.apikey headers["Accept"] = "application/json" headers["User-Agent"] = self.user_agent requests_kwargs["headers"] = headers diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py index afafe217ed..213e859a3f 100644 --- a/scrapers/in/bills.py +++ b/scrapers/in/bills.py @@ -1,5 +1,6 @@ import re import datetime +from urllib.parse import urljoin import lxml import os from collections import OrderedDict @@ -184,7 +185,7 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session): yield vote - def deal_with_version(self, version, bill, bill_id, chamber, session): + def deal_with_latest_version(self, version, bill, api_base_url, session): # documents docs = OrderedDict() docs["Committee Amendment"] = version.get("cmte_amendments", []) @@ -202,7 +203,7 @@ def deal_with_version(self, version, bill, bill_id, chamber, session): doc_list = docs[doc_type] for doc in doc_list: title = "{doc_type}: {name}".format(doc_type=doc_type, name=doc["name"]) - link = f"https://iga.in.gov/pdf-documents/{self.session_prefixes[session]}{doc['link']}.pdf" + link = f"{api_base_url}{doc['link']}?format=pdf" if link not in urls_seen: urls_seen.append(link) bill.add_document_link( @@ -228,7 +229,7 @@ def deal_with_version(self, version, bill, bill_id, chamber, session): if version_chamber != api_name_chamber[1]: versions_match = False - link = f"https://iga.in.gov/pdf-documents/{self.session_prefixes[session]}{version['link']}.pdf" + link = f"{api_base_url}{doc['link']}?format=pdf" # if the chambers don't match, swap the chamber on version name # ex: Engrossed Senate Bill (S) to Engrossed Senate Bill (H) name = ( @@ -260,83 +261,6 @@ def deal_with_version(self, version, bill, bill_id, chamber, session): note=name, url=link, media_type="application/pdf", date=update_date ) - def scrape_web_versions(self, session, bill, bill_id): - # found via web inspector of the requests to - # https://iga.in.gov/documents/{doc_id} - # the web url for downloading a doc is https://iga.in.gov/documents/{doc_id}/download - # where doc_id is the data-myiga-actiondata attribute of the link - # this id isn't available in the API, so we have to scrape it - - # IN Web requests use cloudflare, which requires a User-Agent to be set - headers = { - "User-Agent": "openstates.org", - } - - bill_url = self._get_bill_url(session, bill_id) - page = self.get(bill_url, verify=False, headers=headers).content - page = lxml.html.fromstring(page) - - # each printing has its version, fiscalnotes, and amendments in an
<li>
- for version_section in page.xpath('//div[@id="bill-versions"]/div/ul/li'):
- version_name = ""
- for link in version_section.xpath(
- 'div/div[1]/a[contains(@data-myiga-action,"pdfviewer.loadpdf") and contains(@class,"accordion-header")]'
- ):
- doc_id = link.xpath("@data-myiga-actiondata")[0]
- version_name = link.xpath("@title")[0]
- # found via web inspector of the requests to
- # http://iga.in.gov/documents/{doc_id}
- download_link = f"https://iga.in.gov/documents/{doc_id}/download"
- bill.add_version_link(
- version_name,
- download_link,
- media_type="application/pdf",
- on_duplicate="ignore",
- )
- self.info(f"Version {doc_id} {version_name} {download_link}")
-
- for link in version_section.xpath(
- './/li[contains(@class,"fiscalnote-item")]/a[contains(@data-myiga-action,"pdfviewer.loadpdf")][1]'
- ):
- doc_id = link.xpath("@data-myiga-actiondata")[0]
- document_title = link.xpath("div[1]/text()")[0].strip()
- document_name = "{} {}".format(version_name, document_title)
- download_link = f"https://iga.in.gov/documents/{doc_id}/download"
- bill.add_document_link(
- document_name,
- download_link,
- media_type="application/pdf",
- on_duplicate="ignore",
- )
- self.info(f"Fiscal Note {doc_id} {document_name} {download_link}")
-
- for link in version_section.xpath(
- './/li[contains(@class,"amendment-item")]/a[contains(@data-myiga-action,"pdfviewer.loadpdf")][1]'
- ):
- doc_id = link.xpath("@data-myiga-actiondata")[0]
- document_title = link.xpath("div[1]/text()")[0].strip()
- document_name = "{} {}".format(version_name, document_title)
- download_link = f"https://iga.in.gov/documents/{doc_id}/download"
- # If an amendment has passed, add it as a version, otherwise as a document
- if "passed" in document_title.lower():
- bill.add_version_link(
- document_name,
- download_link,
- media_type="application/pdf",
- on_duplicate="ignore",
- )
- self.info(
- f"Passed Amendment {doc_id} {document_name} {download_link}"
- )
- else:
- bill.add_document_link(
- document_name,
- download_link,
- media_type="application/pdf",
- on_duplicate="ignore",
- )
- self.info(f"Amendment {doc_id} {document_name} {download_link}")
-
 def scrape(self, session=None):
 self._bill_prefix_map = {
 "HB": {"type": "bill", "url_segment": "bills/house"},
@@ -377,8 +301,6 @@ def scrape(self, session=None):
 },
 }
- api_base_url = "https://api.iga.in.gov"
-
 # ah, indiana. it's really, really hard to find
 # pdfs in their web interface. Super easy with
 # the api, but a key needs to be passed
@@ -388,6 +310,8 @@ def scrape(self, session=None):
 # using our api key for pdf document access.
 client = ApiClient(self)
+ api_base_url = client.root
+
 r = client.get("bills", session=session)
 all_pages = client.unpaginate(r)
@@ -400,22 +324,21 @@ def scrape(self, session=None):
 bill_id = b["billName"]
 disp_bill_id = b["displayName"]
 bill_link = b["link"]
- api_source = api_base_url + bill_link
+ api_source = urljoin(api_base_url, bill_link)
+
 try:
- bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
+ bill_json = client.get("bill", session=session, bill_link=bill_link)
 except scrapelib.HTTPError:
 self.logger.warning("Bill could not be accessed. 
Skipping.") continue - - title = bill_json["description"] - if title == "NoneNone": - title = None # sometimes description is blank # if that's the case, we can check to see if # the latest version has a short description + title = bill_json["description"] + if "NoneNone" in title: + title = None if not title: title = bill_json["latestVersion"]["shortDescription"] - # and if that doesn't work, use the bill_id but throw a warning if not title: title = bill_id @@ -450,11 +373,11 @@ def scrape(self, session=None): # actions action_link = bill_json["actions"]["link"] - api_source = api_base_url + action_link + api_source = urljoin(api_base_url, action_link) try: actions = client.get( - "bill_actions", session=session, bill_id=bill_id.lower() + "bill_actions", session=session, action_link=action_link ) actions = client.unpaginate(actions) except scrapelib.HTTPError: @@ -550,21 +473,12 @@ def scrape(self, session=None): media_type="application/pdf", on_duplicate="ignore", ) - # # put this behind a flag 2021-03-18 (openstates/issues#291) - # if not SCRAPE_WEB_VERSIONS: - # # versions - # self.deal_with_version( - # bill_json["latestVersion"], bill, bill_id, original_chamber, session - # ) - # for version in bill_json["versions"][::-1]: - # self.deal_with_version( - # version, - # bill, - # bill_id, - # original_chamber, - # session, - # ) - # else: - # self.scrape_web_versions(session, bill, bill_id) + + self.deal_with_latest_version( + bill_json["latestVersion"], + bill, + api_base_url, + session, + ) yield bill From 6f5c399a498ad0d7168e00f9ddf63ee6fc6f29be Mon Sep 17 00:00:00 2001 From: braykuka Date: Sun, 27 Oct 2024 19:55:29 +0100 Subject: [PATCH 04/14] IN: bills and event scraper rewrite 2025 --- scrapers/in/events.py | 121 ++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 80 deletions(-) diff --git a/scrapers/in/events.py b/scrapers/in/events.py index 87946d01cd..df28f459ea 100644 --- a/scrapers/in/events.py +++ b/scrapers/in/events.py @@ -1,13 +1,12 @@ +import json import logging -import os from datetime import date +from urllib.parse import urljoin import dateutil.parser -from http import HTTPStatus import pytz -import requests -import time from openstates.scrape import Scraper, Event +from .apiclient import ApiClient from .utils import add_space from openstates.exceptions import EmptyScrape @@ -22,116 +21,78 @@ class INEventScraper(Scraper): "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/108.0.0.0 Safari/537.36" # noqa } - base_url = "https://api.iga.in.gov" + base_url = "https://beta-api.iga.in.gov" session = date.today().year - _session = requests.Session() - _retry_codes = ( - HTTPStatus.TOO_MANY_REQUESTS, - HTTPStatus.INTERNAL_SERVER_ERROR, - HTTPStatus.BAD_GATEWAY, - HTTPStatus.SERVICE_UNAVAILABLE, - HTTPStatus.GATEWAY_TIMEOUT, - ) - - def _in_request(self, url): - """ - Make request to INDIANA API - """ - apikey = os.environ["INDIANA_API_KEY"] - useragent = os.getenv("USER_AGENT", self.cf_headers["User-Agent"]) - headers = { - "Authorization": apikey, - "Accept": "application/json", - "User-Agent": useragent, - } - res = self._session.get(url, headers=headers) - attempts = 0 - while attempts < 5 and res.status_code in self._retry_codes: - log.warning( - f"Got rate-limiting error response {res.status_code} for {url}. Retrying..." 
- ) - attempts += 1 - time.sleep(15) - res = self._session.get(url, headers=headers) - if res.status_code == 520: - self.logger.warning(f"Got CloudFlare error for {url}. Skipping...") - return {} - res.raise_for_status() - return res + + def __init__(self, *args, **kwargs): + self.apiclient = ApiClient(self) + super().__init__(*args, **kwargs) def scrape(self): - res = self._in_request(f"{self.base_url}/{self.session}/standing-committees") - if not res: + response = self.apiclient.get("meetings", session=self.session) + meetings = response["meetings"] + if len(meetings["items"]) == 0: raise EmptyScrape - for committee in res.json()["items"]: - committee_path = committee["link"].replace( - "standing-committees", "committees" + for item in meetings["items"]: + meeting = self.apiclient.get( + "meeting", session=self.session, meeting_link=item["link"] ) - url = f"{self.base_url}{committee_path}/meetings" - for event in self.extract_committee_events(url, committee): - yield event - - def extract_committee_events(self, url, committee): - - res = self._in_request(url) - if not res: - return [] - event_names = set() - committee_name = f"{committee['chamber']} {committee['name']}" - for meeting in res.json()["items"]: + if meeting["cancelled"] != "False": continue - link = meeting["link"] + committee = meeting["committee"] + + link = urljoin(self.base_url, meeting["link"]) _id = link.split("/")[-1] - extra_details = self._in_request(f"{self.base_url}{link}").json() date = meeting["meetingdate"].replace(" ", "") time = meeting["starttime"] if time: time = time.replace(" ", "") - location = ( - meeting["location"] - or extra_details.get("location", None) - or "See Agenda" - ) + when = dateutil.parser.parse(f"{date} {time}") + all_day = False + else: + when = dateutil.parser.parse(date).date() + all_day = True + when = self._tz.localize(when) + + location = meeting["location"] or "See Agenda" + video_url = ( f"https://iga.in.gov/legislative/{self.session}/meeting/watchlive/{_id}" ) - try: - when = dateutil.parser.parse(f"{date} {time}") - except dateutil.parser.ParserError: - log.info(f"Could not parse date: {date} {time}") - when = dateutil.parser.parse(date) - when = self._tz.localize(when) event_name = f"{committee['chamber']}#{committee['name']}#{location}#{when}" - if event_name in event_names: - self.warning(f"Duplicate event {event_name}") - continue - event_names.add(event_name) + event = Event( - name=committee_name, + name=committee["name"], start_date=when, + all_day=all_day, location_name=location, classification="committee-meeting", ) event.dedupe_key = event_name - event.add_source(url, note="API document") - event.add_source(f"{self.base_url}{link}", note="API details") + event.add_source(link, note="API details") name_slug = committee["name"].lower().replace(" ", "-") event.add_source( f"https://iga.in.gov/{self.session}/committees/{committee['chamber'].lower()}/{name_slug}", note="Committee Schedule", ) - event.add_participant(committee_name, type="committee", note="host") + event.add_participant(committee["name"], type="committee", note="host") event.add_media_link("Video of Hearing", video_url, media_type="text/html") agenda = event.add_agenda_item("Bills under consideration") - for item in extra_details.get("agenda", []): - if not item.get("bill", None): + + agendas = meeting.get("agenda") + if type(agendas) == str: + agendas = json.loads(meeting.get("agenda")) + + for agenda_item in agendas: + if not agenda_item.get("bill", None): continue - bill_id = item["bill"].get("billName") 
+ bill_id = agenda_item["bill"].get("billName")
 bill_id = add_space(bill_id)
 agenda.add_bill(bill_id)
+
 yield event
From 97b6a4dc8a7de0a8ca0db643fe44e8b15cb10ca8 Mon Sep 17 00:00:00 2001
From: braykuka
Date: Sun, 27 Oct 2024 19:59:18 +0100
Subject: [PATCH 05/14] fix lint issue
---
 scrapers/in/bills.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py
index 213e859a3f..4074bca4a3 100644
--- a/scrapers/in/bills.py
+++ b/scrapers/in/bills.py
@@ -1,7 +1,6 @@
 import re
 import datetime
 from urllib.parse import urljoin
-import lxml
 import os
 from collections import OrderedDict

From 50bc5bef3f068c355443793472651910111f7efe Mon Sep 17 00:00:00 2001
From: braykuka
Date: Sun, 27 Oct 2024 20:02:22 +0100
Subject: [PATCH 06/14] fix lint issue
---
 scrapers/in/events.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapers/in/events.py b/scrapers/in/events.py
index df28f459ea..6f9eefe352 100644
--- a/scrapers/in/events.py
+++ b/scrapers/in/events.py
@@ -85,7 +85,7 @@ def scrape(self):
 agenda = event.add_agenda_item("Bills under consideration")

 agendas = meeting.get("agenda")
- if type(agendas) == str:
+ if type(agendas) is str:
 agendas = json.loads(meeting.get("agenda"))

 for agenda_item in agendas:
From 729087e262dc0598f327570d26cabdf7031a4276 Mon Sep 17 00:00:00 2001
From: braykuka
Date: Mon, 28 Oct 2024 17:51:11 +0100
Subject: [PATCH 07/14] add committee
---
 scrapers/in/committees.py | 33 +++++++++++++++++++-------------- 
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/scrapers/in/committees.py b/scrapers/in/committees.py
index 1fee563e1f..cb55a50945 100644
--- a/scrapers/in/committees.py
+++ b/scrapers/in/committees.py
@@ -14,18 +14,18 @@ class INCommitteeScraper(Scraper):
 def process_special_members(self, comm, comm_json, role_name):
 role_dict = {
 "chair": "Chair",
+ "co-chairs": "Chair",
 "viceChair": "Vice Chair",
 "rankingMinMember": "Ranking Minority Member",
 }
- try:
- mem = comm_json[role_name]
- except KeyError:
- return
- if mem:
- person = mem["firstName"] + " " + mem["lastName"]
+
+ members = []
+ for member in comm_json[role_name]:
+ person = member["firstName"] + " " + member["lastName"]
 comm.add_member(person, role=role_dict[role_name])
- return person
- return None
+ members.append(person)
+
+ return members

 def get_subcommittee_info(self, session):
 # api gives NO way of finding out who owns
@@ -54,7 +54,7 @@ def get_subcommittee_info(self, session):
 def scrape(self, session):
 subcomms = self.get_subcommittee_info(session)

- api_base_url = "https://api.iga.in.gov"
+ api_base_url = "https://beta-api.iga.in.gov"
 html_base_url = "http://iga.in.gov/legislative/{}/committees/".format(session)
 client = ApiClient(self)
 r = client.get("committees", session=session)
@@ -74,7 +74,7 @@ def scrape(self, session):
 continue
 try:
 chamber = comm_json["chamber"]["name"]
- except KeyError:
+ except TypeError:
 chamber = "joint"
 else:
 if chamber == "Senate":
@@ -109,13 +109,18 @@ def scrape(self, session):
 classification="committee",
 )

- chair = self.process_special_members(comm, comm_json, "chair")
- vicechair = self.process_special_members(comm, comm_json, "viceChair")
- ranking = self.process_special_members(comm, comm_json, "rankingMinMember")
+ chairs = self.process_special_members(comm, comm_json, "chair")
+ cochairs = self.process_special_members(comm, comm_json, "co-chairs")
+ vicechairs = self.process_special_members(comm, comm_json, "viceChair")
+ rankingMinMembers = self.process_special_members(
+ comm, comm_json, 
"rankingMinMember" + ) # leadership is also listed in membership # so we have to make sure we haven't seen them yet - comm_members = [m for m in [chair, vicechair, ranking] if m] + comm_members = [ + m for m in [*chairs, *cochairs, *vicechairs, *rankingMinMembers] if m + ] for mem in comm_json["members"]: mem_name = mem["firstName"] + " " + mem["lastName"] From 0ac61c90360360ba76b2c91b24fc7e7d64be9af1 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Mon, 28 Oct 2024 12:26:30 -0600 Subject: [PATCH 08/14] IN: do timezline localization only for events that have times --- scrapers/in/events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapers/in/events.py b/scrapers/in/events.py index 6f9eefe352..324f8675df 100644 --- a/scrapers/in/events.py +++ b/scrapers/in/events.py @@ -52,11 +52,11 @@ def scrape(self): if time: time = time.replace(" ", "") when = dateutil.parser.parse(f"{date} {time}") + when = self._tz.localize(when) all_day = False else: when = dateutil.parser.parse(date).date() all_day = True - when = self._tz.localize(when) location = meeting["location"] or "See Agenda" From 5773420ebbcc5389afa8718a0f66d2e62545e5d0 Mon Sep 17 00:00:00 2001 From: braykuka Date: Wed, 30 Oct 2024 20:41:44 +0100 Subject: [PATCH 09/14] fix: review changes --- scrapers/in/__init__.py | 1 + scrapers/in/apiclient.py | 31 ++++++++++++++++- scrapers/in/bills.py | 25 ++++++++------ scrapers/in/committees.py | 12 ++++--- scrapers/in/events.py | 73 +++++++++++++++++++++++++++++++-------- scrapers/in/utils.py | 2 +- 6 files changed, 111 insertions(+), 33 deletions(-) diff --git a/scrapers/in/__init__.py b/scrapers/in/__init__.py index 302de2d1bf..299867b480 100644 --- a/scrapers/in/__init__.py +++ b/scrapers/in/__init__.py @@ -3,6 +3,7 @@ from openstates.scrape import State from .bills import INBillScraper from .events import INEventScraper +from .committees import INCommitteeScraper settings = dict(SCRAPELIB_TIMEOUT=600) diff --git a/scrapers/in/apiclient.py b/scrapers/in/apiclient.py index 7a7e936d74..083c0749ab 100644 --- a/scrapers/in/apiclient.py +++ b/scrapers/in/apiclient.py @@ -1,7 +1,9 @@ import os +import re import time from urllib.parse import urljoin import functools +import requests """ API key must be passed as a header. 
You need the following headers to get JSON: @@ -54,6 +56,7 @@ class ApiClient(object): root = "https://beta-api.iga.in.gov" resources = dict( sessions="/", + session="/{session}", subjects="/{session}/subjects", chambers="/{session}/chambers", bills="/{session}/bills", @@ -71,6 +74,7 @@ class ApiClient(object): chamber_legislators="/{session}/chambers/{chamber}/legislators", bill_version="/{session}/bills/{bill_id}/versions/{version_id}", fiscal_notes="/{session}/fiscal-notes", + document="{doc_link}", ) def __init__(self, scraper): @@ -78,6 +82,31 @@ def __init__(self, scraper): self.apikey = os.environ["INDIANA_API_KEY"] self.user_agent = os.getenv("USER_AGENT", "openstates") + def get_session_no(self, session): + session_no = "" + headers = {} + headers["x-api-key"] = self.apikey + headers["Accept"] = "application/json" + headers["User-Agent"] = self.user_agent + url = urljoin(self.root, f"/{session}") + resp = requests.get(url, headers=headers).json() + session_no_regex = re.search(r"Session\s+(\d+).+", resp["name"]) + + if session_no_regex: + session_no = session_no_regex.group(1) + + return session_no + + def get_document_url(self, url): + headers = {} + headers["x-api-key"] = self.apikey + headers["Accept"] = "application/pdf" + headers["User-Agent"] = self.user_agent + url = urljoin(self.root, url) + resp = requests.get(url, headers=headers, allow_redirects=False) + if "Location" in resp.headers: + return resp.headers["Location"] + @check_response def geturl(self, url): headers = {} @@ -91,7 +120,7 @@ def geturl(self, url): def get_relurl(self, url): headers = {} headers["x-api-key"] = self.apikey - headers["Accept"] = "application/json" + headers["Accept"] = "application/pdf" headers["User-Agent"] = self.user_agent url = urljoin(self.root, url) self.scraper.info("Api GET: %r, %r" % (url, headers)) diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py index 4074bca4a3..4011b8a44a 100644 --- a/scrapers/in/bills.py +++ b/scrapers/in/bills.py @@ -26,13 +26,6 @@ class INBillScraper(Scraper): _tz = pytz.timezone("US/Eastern") - # prefixes for PDF files for session - session_prefixes = { - "2024": "123", - "2023": "123", - "2022": "122", - } - def _get_bill_id_components(self, bill_id): bill_prefix = "".join([c for c in bill_id if c.isalpha()]) bill_number = "".join([c for c in bill_id if c.isdigit()]).lstrip("0") @@ -184,7 +177,12 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session): yield vote - def deal_with_latest_version(self, version, bill, api_base_url, session): + def deal_with_latest_version( + self, + version, + bill, + api_base_url, + ): # documents docs = OrderedDict() docs["Committee Amendment"] = version.get("cmte_amendments", []) @@ -310,7 +308,7 @@ def scrape(self, session=None): client = ApiClient(self) api_base_url = client.root - + self.session_no = client.get_session_no(session) r = client.get("bills", session=session) all_pages = client.unpaginate(r) @@ -323,6 +321,7 @@ def scrape(self, session=None): bill_id = b["billName"] disp_bill_id = b["displayName"] bill_link = b["link"] + api_source = urljoin(api_base_url, bill_link) try: @@ -330,6 +329,11 @@ def scrape(self, session=None): except scrapelib.HTTPError: self.logger.warning("Bill could not be accessed. 
Skipping.") continue + + # vehicle bill + if len(list(bill_json.keys())) == 0: + self.logger.warning("Vehicle Bill: {}".format(bill_id)) + continue # sometimes description is blank # if that's the case, we can check to see if # the latest version has a short description @@ -465,7 +469,7 @@ def scrape(self, session=None): # note there are a number of links in the API response that won't work with just a browser, they need an api key # https://iga.in.gov/pdf-documents/123/2024/house/resolutions/HC0001/HC0001.01.INTR.pdf category = "resolutions" if "resolution" in bill_type else "bills" - url = f"https://iga.in.gov/pdf-documents/{self.session_prefixes[session]}/{bill_json['year']}/{bill_json['originChamber']}/{category}/{v['billName']}/{v['printVersionName']}.pdf" + url = f"https://iga.in.gov/pdf-documents/{self.session_no}/{bill_json['year']}/{bill_json['originChamber']}/{category}/{v['billName']}/{v['printVersionName']}.pdf" bill.add_version_link( v["stageVerbose"], url, @@ -477,7 +481,6 @@ def scrape(self, session=None): bill_json["latestVersion"], bill, api_base_url, - session, ) yield bill diff --git a/scrapers/in/committees.py b/scrapers/in/committees.py index cb55a50945..bc1b230701 100644 --- a/scrapers/in/committees.py +++ b/scrapers/in/committees.py @@ -14,13 +14,13 @@ class INCommitteeScraper(Scraper): def process_special_members(self, comm, comm_json, role_name): role_dict = { "chair": "Chair", - "co-chairs": "Chair", + "co_chairs": "Chair", "viceChair": "Vice Chair", "rankingMinMember": "Ranking Minority Member", } members = [] - for member in comm_json[role_name]: + for member in comm_json.get(role_name, []): person = member["firstName"] + " " + member["lastName"] comm.add_member(person, role=role_dict[role_name]) members.append(person) @@ -74,7 +74,7 @@ def scrape(self, session): continue try: chamber = comm_json["chamber"]["name"] - except TypeError: + except Exception: chamber = "joint" else: if chamber == "Senate": @@ -85,9 +85,11 @@ def scrape(self, session): raise AssertionError("Unknown committee chamber {}".format(chamber)) name = comm_json["name"] + if not name: + continue try: owning_comm = subcomms[name] - except KeyError: + except Exception: name = name.replace("Statutory Committee on", "").strip() comm = Organization( name=name, chamber=chamber, classification="committee" @@ -110,7 +112,7 @@ def scrape(self, session): ) chairs = self.process_special_members(comm, comm_json, "chair") - cochairs = self.process_special_members(comm, comm_json, "co-chairs") + cochairs = self.process_special_members(comm, comm_json, "co_chairs") vicechairs = self.process_special_members(comm, comm_json, "viceChair") rankingMinMembers = self.process_special_members( comm, comm_json, "rankingMinMember" diff --git a/scrapers/in/events.py b/scrapers/in/events.py index 324f8675df..ac75045e14 100644 --- a/scrapers/in/events.py +++ b/scrapers/in/events.py @@ -1,5 +1,6 @@ import json import logging +import re from datetime import date from urllib.parse import urljoin @@ -29,6 +30,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def scrape(self): + session_no = self.apiclient.get_session_no(self.session) response = self.apiclient.get("meetings", session=self.session) meetings = response["meetings"] if len(meetings["items"]) == 0: @@ -46,7 +48,20 @@ def scrape(self): link = urljoin(self.base_url, meeting["link"]) _id = link.split("/")[-1] - + committee_name = ( + committee["name"] + .replace(",", "") + .replace("Committee on", "Committee") + .strip() + ) + committee_type 
= ( + "conference" + if "Conference" in committee["name"] + else ("standing" if committee["chamber"] else "interim") + ) + committee_chamber = ( + committee["chamber"].lower() if committee["chamber"] else "universal" + ) date = meeting["meetingdate"].replace(" ", "") time = meeting["starttime"] if time: @@ -63,11 +78,10 @@ def scrape(self): video_url = ( f"https://iga.in.gov/legislative/{self.session}/meeting/watchlive/{_id}" ) - - event_name = f"{committee['chamber']}#{committee['name']}#{location}#{when}" + event_name = f"{committee['chamber']}#{committee_name}#{location}#{when}" event = Event( - name=committee["name"], + name=committee_name, start_date=when, all_day=all_day, location_name=location, @@ -75,24 +89,53 @@ def scrape(self): ) event.dedupe_key = event_name event.add_source(link, note="API details") - name_slug = committee["name"].lower().replace(" ", "-") + name_slug = committee_name.lower().replace(" ", "-") + name_slug = re.sub("[^a-zA-Z0-9]+", "-", committee_name.lower()) + document_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/meeting.pdf" + event.add_source( - f"https://iga.in.gov/{self.session}/committees/{committee['chamber'].lower()}/{name_slug}", + f"https://iga.in.gov/{self.session}/committees/{committee['chamber'].lower() or 'interim'}/{name_slug}", note="Committee Schedule", ) - event.add_participant(committee["name"], type="committee", note="host") + event.add_participant(committee_name, type="committee", note="host") + event.add_document( + "Meeting Agenda", document_url, media_type="applicaiton/pdf" + ) event.add_media_link("Video of Hearing", video_url, media_type="text/html") - agenda = event.add_agenda_item("Bills under consideration") - agendas = meeting.get("agenda") + agendas = meeting["agenda"] if type(agendas) is str: - agendas = json.loads(meeting.get("agenda")) + agendas = json.loads(meeting["agenda"]) + if agendas: + agenda = event.add_agenda_item("Bills under consideration") for agenda_item in agendas: - if not agenda_item.get("bill", None): - continue - bill_id = agenda_item["bill"].get("billName") - bill_id = add_space(bill_id) - agenda.add_bill(bill_id) + if agenda_item.get("bill", None): + bill_id = agenda_item["bill"].get("billName") + bill_id = add_space(bill_id) + agenda.add_bill(bill_id) + else: + agenda.add_subject(agenda_item["description"]) + + for exhibit in meeting.get("exhibits"): + exhibit_pdf_url = self.apiclient.get_document_url( + exhibit["pdfDownloadLink"] + ) + self.logger.info(exhibit["pdfDownloadLink"]) + if exhibit_pdf_url: + event.add_document( + exhibit["description"], + exhibit_pdf_url, + media_type="application/pdf", + ) + + for minute in meeting.get("minutes"): + if minute["link"]: + minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf" + event.add_document( + "Meeting Minutes", + minute_pdf_url, + media_type="application/pdf", + ) yield event diff --git a/scrapers/in/utils.py b/scrapers/in/utils.py index 6821c0f935..cc624994e4 100644 --- a/scrapers/in/utils.py +++ b/scrapers/in/utils.py @@ -52,7 +52,7 @@ def add_space(text): # Slice the string to get the number and text parts alpha = text[:index] - number = text[index:] + number = text[index:].lstrip("0") new_string = f"{alpha} {number}" From bbd2c6b7da8bf2435fe11ef93f4c928e37cb4076 Mon Sep 17 00:00:00 2001 From: braykuka Date: Thu, 31 Oct 2024 08:30:35 +0100 Subject: [PATCH 
10/14] fix: small issue --- scrapers/in/bills.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py index 4011b8a44a..6e26047ffa 100644 --- a/scrapers/in/bills.py +++ b/scrapers/in/bills.py @@ -226,7 +226,7 @@ def deal_with_latest_version( if version_chamber != api_name_chamber[1]: versions_match = False - link = f"{api_base_url}{doc['link']}?format=pdf" + link = f"{api_base_url}{version['link']}?format=pdf" # if the chambers don't match, swap the chamber on version name # ex: Engrossed Senate Bill (S) to Engrossed Senate Bill (H) name = ( @@ -385,7 +385,7 @@ def scrape(self, session=None): actions = client.unpaginate(actions) except scrapelib.HTTPError: self.logger.warning("Could not find bill actions page") - actions = {"items": []} + actions = [] for a in actions: action_desc = a["description"] From 4be0b12815f34503ecb977a8cccf3dc20f040fe2 Mon Sep 17 00:00:00 2001 From: braykuka Date: Thu, 31 Oct 2024 08:31:59 +0100 Subject: [PATCH 11/14] fix lint issue --- scrapers/in/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapers/in/__init__.py b/scrapers/in/__init__.py index 299867b480..302de2d1bf 100644 --- a/scrapers/in/__init__.py +++ b/scrapers/in/__init__.py @@ -3,7 +3,6 @@ from openstates.scrape import State from .bills import INBillScraper from .events import INEventScraper -from .committees import INCommitteeScraper settings = dict(SCRAPELIB_TIMEOUT=600) From 41736ae57ec56b9187ab01688f9fd78b7d4fc442 Mon Sep 17 00:00:00 2001 From: braykuka Date: Thu, 31 Oct 2024 08:37:15 +0100 Subject: [PATCH 12/14] fix the duplicate of version pdfs --- scrapers/in/bills.py | 51 -------------------------------------------- 1 file changed, 51 deletions(-) diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py index 6e26047ffa..cc2393d743 100644 --- a/scrapers/in/bills.py +++ b/scrapers/in/bills.py @@ -207,57 +207,6 @@ def deal_with_latest_version( note=title, url=link, media_type="application/pdf" ) - # version which can sometimes have the wrong stageVerbose - # add check that last letter of printVersionName matches - # ex: stageVerbose being House Bill (H) - # and printVersionName being HB1189.03.COMS and the link - # being for HB1189.03.COMS which is the Senate bill - # some example bills in 2020 are HB1189, SB241, SB269, HC18 - versions_match = True - # get version chamber and api name, check chamber - version_chamber = version["printVersionName"][-1] - api_version_name = version["stageVerbose"] - # check any versions not enrolled or introduced which are correct - api_name_chamber = re.search( - r"^(?:Engrossed |)(?:House|Senate) (?:Bill|Resolution) \((.)\)", - api_version_name, - ) - if api_name_chamber is not None: - if version_chamber != api_name_chamber[1]: - versions_match = False - - link = f"{api_base_url}{version['link']}?format=pdf" - # if the chambers don't match, swap the chamber on version name - # ex: Engrossed Senate Bill (S) to Engrossed Senate Bill (H) - name = ( - api_version_name - if versions_match - else api_version_name[:-2] + version_chamber + api_version_name[-1:] - ) - if link not in urls_seen: - urls_seen.append(link) - update_date = version["updated"] - create_date = version["created"] - intro_date = version["introduced"] - file_date = version["filed"] - for d in [update_date, create_date, intro_date, file_date]: - try: - # pupa choked when I passed datetimes, so passing dates only. 
- # If we figure out how to make pupa not choke, here's the line you want:
- # ## #
- # self._tz.localize(datetime.datetime.strptime(d, "%Y-%m-%dT%H:%M:%S"))
- update_date = datetime.datetime.strptime(
- d, "%Y-%m-%dT%H:%M:%S"
- ).date()
- except TypeError:
- continue
- else:
- break
-
- bill.add_version_link(
- note=name, url=link, media_type="application/pdf", date=update_date
- )
-
 def scrape(self, session=None):
 self._bill_prefix_map = {
 "HB": {"type": "bill", "url_segment": "bills/house"},
From 83b13a1275d44ee7ca30d65c7eb2031c860d6fda Mon Sep 17 00:00:00 2001
From: braykuka
Date: Thu, 31 Oct 2024 08:52:35 +0100
Subject: [PATCH 13/14] add comments for the proxy-based urls
---
 scrapers/in/bills.py | 4 +++-
 scrapers/in/events.py | 23 +++++++++++++----------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py
index cc2393d743..58db3eb0df 100644
--- a/scrapers/in/bills.py
+++ b/scrapers/in/bills.py
@@ -311,7 +311,7 @@ def scrape(self, session=None):
 )

 bill.add_source(self._get_bill_url(session, bill_id))
- bill.add_source(api_source)
+ bill.add_source(api_source, note="API details")

 # sponsors
 for s in bill_json["authors"]:
@@ -419,6 +419,8 @@ def scrape(self, session=None):
 # https://iga.in.gov/pdf-documents/123/2024/house/resolutions/HC0001/HC0001.01.INTR.pdf
 category = "resolutions" if "resolution" in bill_type else "bills"
 url = f"https://iga.in.gov/pdf-documents/{self.session_no}/{bill_json['year']}/{bill_json['originChamber']}/{category}/{v['billName']}/{v['printVersionName']}.pdf"
+ # PROXY URL
+ # url = urljoin(PROXY_BASE_URL, v['link'])
 bill.add_version_link(
 v["stageVerbose"],
 url,
diff --git a/scrapers/in/events.py b/scrapers/in/events.py
index ac75045e14..060bf8b9d7 100644
--- a/scrapers/in/events.py
+++ b/scrapers/in/events.py
@@ -13,15 +13,11 @@
 log = logging.getLogger(__name__)

+PROXY_BASE_URL = "https://in-proxy.openstates.org/"

 class INEventScraper(Scraper):
 _tz = pytz.timezone("America/Indianapolis")
- # avoid cloudflare blocks for no UA
- cf_headers = {
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
- "Chrome/108.0.0.0 Safari/537.36" # noqa
- }
 base_url = "https://beta-api.iga.in.gov"
 session = date.today().year

@@ -91,6 +87,7 @@ def scrape(self):
 event.add_source(link, note="API details")
 name_slug = committee_name.lower().replace(" ", "-")
 name_slug = re.sub("[^a-zA-Z0-9]+", "-", committee_name.lower())
+
 document_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/meeting.pdf"

 event.add_source(
@@ -118,10 +115,13 @@ def scrape(self):
 agenda.add_subject(agenda_item["description"])

 for exhibit in meeting.get("exhibits"):
- exhibit_pdf_url = self.apiclient.get_document_url(
- exhibit["pdfDownloadLink"]
- )
- self.logger.info(exhibit["pdfDownloadLink"])
+ # Original URL
+ # exhibit_pdf_url = self.apiclient.get_document_url(
+ # exhibit["pdfDownloadLink"]
+ # )
+ # Proxy URL
+ exhibit_pdf_url = urljoin(PROXY_BASE_URL, exhibit["pdfDownloadLink"])
+ self.logger.info(exhibit_pdf_url)
 if exhibit_pdf_url:
 event.add_document(
 exhibit["description"],
@@ -131,7 +131,10 @@ def scrape(self):
 for minute in meeting.get("minutes"):
 if minute["link"]:
- minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf"
+ # Original URL
+ # minute_pdf_url = 
f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf" + # Proxy URL + minute_pdf_url = urljoin(PROXY_BASE_URL, minute["link"]) event.add_document( "Meeting Minutes", minute_pdf_url, From 3e71fd0f6644862981a574078afe5bd0e7f0cc67 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Thu, 31 Oct 2024 13:17:57 -0500 Subject: [PATCH 14/14] IN: Add comments, fix typo, remove unused code --- scrapers/in/events.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scrapers/in/events.py b/scrapers/in/events.py index 060bf8b9d7..9deccc3b50 100644 --- a/scrapers/in/events.py +++ b/scrapers/in/events.py @@ -85,7 +85,6 @@ def scrape(self): ) event.dedupe_key = event_name event.add_source(link, note="API details") - name_slug = committee_name.lower().replace(" ", "-") name_slug = re.sub("[^a-zA-Z0-9]+", "-", committee_name.lower()) document_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/meeting.pdf" @@ -96,7 +95,7 @@ def scrape(self): ) event.add_participant(committee_name, type="committee", note="host") event.add_document( - "Meeting Agenda", document_url, media_type="applicaiton/pdf" + "Meeting Agenda", document_url, media_type="application/pdf" ) event.add_media_link("Video of Hearing", video_url, media_type="text/html") @@ -119,7 +118,7 @@ def scrape(self): # exhibit_pdf_url = self.apiclient.get_document_url( # exhibit["pdfDownloadLink"] # ) - # Proxy URL + # Proxy URL used because URL provided by API is not directly accessible over the web exhibit_pdf_url = urljoin(PROXY_BASE_URL, exhibit["pdfDownloadLink"]) self.logger.info(exhibit_pdf_url) if exhibit_pdf_url: @@ -133,7 +132,7 @@ def scrape(self): if minute["link"]: # Original URL # minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf" - # Proxy URL + # Proxy URL used because URL provided by API is not directly accessible over the web minute_pdf_url = urljoin(PROXY_BASE_URL, minute["link"]) event.add_document( "Meeting Minutes",