IN: change IN API URL due to launch, fix vote PDF fetching
jessemortenson committed Dec 17, 2024
1 parent 4bd1cf4 commit e573fe6
Showing 5 changed files with 72 additions and 12 deletions.
2 changes: 1 addition & 1 deletion scrapers/in/__init__.py
@@ -179,6 +179,6 @@ def get_session_list(self):
             "Accept": "application/json",
             "User-Agent": useragent,
         }
-        resp = requests.get("https://beta-api.iga.in.gov", headers=headers)
+        resp = requests.get("https://api.iga.in.gov", headers=headers)
         resp.raise_for_status()
         return [session["name"] for session in resp.json()["sessions"]]
20 changes: 18 additions & 2 deletions scrapers/in/apiclient.py
@@ -50,10 +50,10 @@ def wrapped(self, *args, **kwargs):
 
 class ApiClient(object):
     """
-    docs: https://docs.beta-api.iga.in.gov
+    docs: https://docs.api.iga.in.gov/
     """
 
-    root = "https://beta-api.iga.in.gov"
+    root = "https://api.iga.in.gov"
     resources = dict(
         sessions="/",
         session="/{session}",
@@ -130,6 +130,22 @@ def get_relurl(self, url):
         self.scraper.info("Api GET: %r, %r" % (url, headers))
         return self.scraper.get(url, headers=headers)
 
+    # fetch an API url where we expect a redirect
+    # return the new redirect URL (do not fetch it yet)
+    def identify_redirect_url(self, url):
+        if self.root not in url:
+            url = urljoin(self.root, url)
+        headers = {}
+        headers["x-api-key"] = self.apikey
+        headers["Accept"] = "application/json"
+        headers["User-Agent"] = self.user_agent
+        response = requests.get(url, headers=headers, allow_redirects=False)
+        if response.status_code in (301, 302):
+            return response.headers["Location"]
+        else:
+            self.scraper.error(f"Failed to get expected redirect URL from {url}")
+            return None
+
     def make_url(self, resource_name, **url_format_args):
         # Build up the url.
         url = self.resources[resource_name]
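The new identify_redirect_url helper above is the heart of the PDF fix: request the ?format=pdf variant with redirects disabled and capture the Location header rather than following it. A minimal standalone sketch of that pattern with plain requests; the API key, rollcall ID, and User-Agent below are placeholders, not values from this commit.

import requests
from urllib.parse import urljoin

root = "https://api.iga.in.gov"
headers = {
    "x-api-key": "YOUR-API-KEY",  # placeholder, not a real credential
    "Accept": "application/json",
    "User-Agent": "example-scraper/1.0",  # placeholder UA
}
# Ask for the PDF variant of a rollcall link without following the redirect,
# so the Location header can be inspected (and fetched later with a
# different User-Agent, as bills.py does below).
resp = requests.get(
    urljoin(root, "/2024/rollcalls/EXAMPLE-ID?format=pdf"),  # hypothetical ID
    headers=headers,
    allow_redirects=False,
)
if resp.status_code in (301, 302):
    print("PDF redirect target:", resp.headers["Location"])
else:
    print("No redirect; status", resp.status_code)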
58 changes: 51 additions & 7 deletions scrapers/in/bills.py
@@ -57,7 +57,7 @@ def _get_bill_url(self, session, bill_id):
 
         return url_template.format(session, url_segment, bill_number)
 
-    def _process_votes(self, rollcalls, bill_id, original_chamber, session):
+    def _process_votes(self, rollcalls, bill_id, original_chamber, session, client):
         result_types = {
             "FAILED": False,
             "DEFEATED": False,
@@ -70,19 +70,49 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session,
         }
 
         for r in rollcalls:
-            proxy_link = PROXY_BASE_URL + r["link"]
+            # each value in rollcalls is an API metadata object describing the rollcall:
+            # it does not include the PDF link explicitly (this can be requested from the "link" url),
+            # but you can add ?format=pdf to the end of the "link" url to synthesize it
+            # {
+            #     "target": "HB1001.03.COMH",
+            #     "chamber": {
+            #         "link": "/2024/chambers/house",
+            #         "name": "House"
+            #     },
+            #     "rollcall_number": "26",
+            #     "results": {
+            #         "yea": 80,
+            #         "nay": 17
+            #     },
+            #     "link": "/2024/rollcalls/{ID_GOES_HERE}",
+            #     "type": "BILL"
+            # }
+            # however the PDF url does not return the PDF content immediately:
+            # it returns a 302 redirect to the actual PDF url,
+            # AND the actual PDF url is sensitive to the incoming User-Agent header
+            vote_url = client.identify_redirect_url(r["link"] + "?format=pdf")
             try:
-                path, _ = self.urlretrieve(proxy_link)
+                headers = {
+                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0"
+                }
+                path, ret_response = self.urlretrieve(vote_url, headers=headers)
             except scrapelib.HTTPError:
                 self.logger.warning(
-                    "Unable to contact openstates proxy, skipping vote {}".format(
-                        proxy_link
+                    "HTTP error fetching vote URL, skipping vote {}".format(
+                        vote_url
                     )
                 )
                 continue
 
-            text = convert_pdf(path, "text").decode("utf-8")
+            # Looks like a missing PDF file ends up being displayed as "404" content in HTML
+            # instead of the server returning a proper 404,
+            # so sanity check whether the content appears to be HTML instead of PDF
+            if ret_response.headers.get("Content-Type") != "application/pdf":
+                self.logger.warning(f"Got unexpected response type {ret_response.headers.get('Content-Type')},"
+                                    f" skipping {vote_url}")
+                continue
+
+            text = convert_pdf(path, "text").decode("utf-8")
             lines = text.split("\n")
             os.remove(path)

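Putting the two quirks together (the 302 redirect and the HTML "404" body served with a success status), here is a rough sketch of the fetch-and-check flow the hunk above implements via scrapelib's urlretrieve, written with plain requests; the redirect target URL is hypothetical.

import requests

# The redirect target is sensitive to the User-Agent, so present a browser UA
# (this string mirrors the one hard-coded in the diff above).
browser_ua = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) "
    "Gecko/20100101 Firefox/118.0"
)
vote_url = "https://iga.in.gov/pdf-documents/EXAMPLE.pdf"  # hypothetical redirect target
resp = requests.get(vote_url, headers={"User-Agent": browser_ua})
# A missing file may come back as an HTML "404" page with a 200 status,
# so trust the declared Content-Type rather than the status code alone.
if resp.headers.get("Content-Type") != "application/pdf":
    print("Skipping: got", resp.headers.get("Content-Type"), "instead of a PDF")
else:
    print("Fetched", len(resp.content), "bytes of PDF")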
@@ -141,11 +171,24 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session,
                 classification="passage",
             )
 
+            # Historically we've counted yeas/nays/excused/NV from parsing the PDF,
+            # but now the API response provides yea and nay counts:
+            # prefer those counts and log if a difference is found
+            api_yea = int(r["results"]["yea"])
+            api_nay = int(r["results"]["nay"])
+            if yeas != api_yea:
+                self.warning(f"API yea count {api_yea} does not match PDF parse {yeas} "
+                             f"at API {r['link']}, PDF {vote_url}")
+                yeas = api_yea
+            if nays != api_nay:
+                self.warning(f"API nay count {api_nay} does not match PDF parse {nays} "
+                             f"at API {r['link']}, PDF {vote_url}")
+                nays = api_nay
             vote.set_count("yes", yeas)
             vote.set_count("no", nays)
             vote.set_count("excused", excused)
             vote.set_count("not voting", not_voting)
-            vote.add_source(proxy_link)
+            vote.add_source(vote_url)
 
             currently_counting = ""

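The reconciliation above boils down to: trust the API's yea/nay totals, log any disagreement with the PDF parse, then overwrite. A toy illustration with made-up counts:

# Made-up numbers standing in for r["results"] and the PDF tallies.
api_yea, api_nay = 80, 17
pdf_yea, pdf_nay = 79, 17

if pdf_yea != api_yea:
    print(f"yea mismatch: API {api_yea} vs PDF {pdf_yea}; preferring API")
    pdf_yea = api_yea
if pdf_nay != api_nay:
    print(f"nay mismatch: API {api_nay} vs PDF {pdf_nay}; preferring API")
    pdf_nay = api_nay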
@@ -414,6 +457,7 @@ def scrape(self, session=None):
                 disp_bill_id,
                 original_chamber,
                 session,
+                client,
             )
 
         for v in bill_json["versions"]:
2 changes: 1 addition & 1 deletion scrapers/in/committees.py
@@ -54,7 +54,7 @@ def get_subcommittee_info(self, session):
     def scrape(self, session):
         subcomms = self.get_subcommittee_info(session)
 
-        api_base_url = "https://beta-api.iga.in.gov"
+        api_base_url = "https://api.iga.in.gov"
         html_base_url = "http://iga.in.gov/legislative/{}/committees/".format(session)
         client = ApiClient(self)
         r = client.get("committees", session=session)
2 changes: 1 addition & 1 deletion scrapers/in/events.py
@@ -13,7 +13,7 @@
 
 class INEventScraper(Scraper):
     _tz = pytz.timezone("America/Indianapolis")
-    base_url = "https://beta-api.iga.in.gov"
+    base_url = "https://api.iga.in.gov"
     session = date.today().year
 
     def __init__(self, *args, **kwargs):
