Skip to content

Commit

Permalink
Merge pull request #5062 from braykuka/1207-IN-bills-scraper-rewrite-…
Browse files (browse the repository at this point in the history)
…2025

IN: bills and events scraper rewrite 2025
  • Loading branch information
jessemortenson authored Oct 31, 2024
2 parents 1f6867b + 3e71fd0 commit 22a2018
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 302 deletions.
6 changes: 3 additions & 3 deletions scrapers/in/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,10 @@ def get_session_list(self):
apikey = os.environ["INDIANA_API_KEY"]
useragent = os.getenv("USER_AGENT", "openstates")
headers = {
"Authorization": apikey,
"x-api-key": apikey,
"Accept": "application/json",
"User-Agent": useragent,
}
resp = requests.get("https://api.iga.in.gov/sessions", headers=headers)
resp = requests.get("https://beta-api.iga.in.gov", headers=headers)
resp.raise_for_status()
return [session["name"] for session in resp.json()["items"]]
return [session["name"] for session in resp.json()["sessions"]]
60 changes: 46 additions & 14 deletions scrapers/in/apiclient.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
import re
import time
from urllib.parse import urljoin
import functools
import requests

"""
API key must be passed as a header. You need the following headers to get JSON:
Authorization = your_apikey
x-api-key = your_apikey
Accept = "application/json"
If you're trying to hit api links through your browser you
Expand Down Expand Up @@ -48,37 +50,67 @@ def wrapped(self, *args, **kwargs):

class ApiClient(object):
"""
docs: http://docs.api.iga.in.gov/
docs: https://docs.beta-api.iga.in.gov
"""

root = "https://api.iga.in.gov/"
root = "https://beta-api.iga.in.gov"
resources = dict(
sessions="/sessions",
sessions="/",
session="/{session}",
subjects="/{session}/subjects",
chambers="/{session}/chambers",
bills="/{session}/bills",
bill="/{session}/bills/{bill_id}",
bill="{bill_link}",
chamber_bills="/{session}/chambers/{chamber}/bills",
# note that rollcall_id has to be pulled off the URL, it's NOT the rollcall_number
rollcalls="/{session}/rollcalls/{rollcall_id}",
bill_actions="/{session}/bills/{bill_id}/actions",
rollcalls="/{session}/rollcalls",
rollcall="{rollcall_link}",
meetings="/{session}/meetings",
meeting="{meeting_link}",
bill_actions="{action_link}",
committees="/{session}/committees",
committee="/{committee_link}",
committee="{committee_link}",
legislators="/{session}/legislators",
legislator="/{session}/legislators/{legislator_id}",
legislator="{legislator_link}",
chamber_legislators="/{session}/chambers/{chamber}/legislators",
bill_version="/{session}/bills/{bill_id}/versions/{version_id}",
fiscal_notes="/{session}/fiscal-notes",
document="{doc_link}",
)

def __init__(self, scraper):
self.scraper = scraper
self.apikey = os.environ["INDIANA_API_KEY"]
self.user_agent = os.getenv("USER_AGENT", "openstates")

def get_session_no(self, session):
    """
    Return the session number embedded in a session's display name.

    Requests ``/{session}`` relative to ``self.root`` and searches the
    ``name`` field of the JSON response for the digits that follow the
    word "Session".

    :param session: session identifier, used as the URL path segment
    :return: the digits as a string, or "" when the name contains no
        "Session <digits>" portion
    :raises requests.HTTPError: if the API answers with an error status
    """
    headers = {
        "x-api-key": self.apikey,
        "Accept": "application/json",
        "User-Agent": self.user_agent,
    }
    url = urljoin(self.root, f"/{session}")
    resp = requests.get(url, headers=headers)
    # Surface 4xx/5xx responses directly instead of letting them show up
    # later as a JSON decoding error or a missing "name" key.
    resp.raise_for_status()

    # No trailing ".+" after the capture group: requiring extra characters
    # made the greedy \d+ give one digit back when the name ended in the
    # number ("Session 124" -> "12") and rejected single-digit numbers.
    match = re.search(r"Session\s+(\d+)", resp.json()["name"])
    return match.group(1) if match else ""

def get_document_url(self, url):
    """
    Look up where the API redirects a document link.

    Issues a GET for ``url`` (resolved against ``self.root``) asking for
    a PDF, without following redirects.

    :param url: document link, absolute or relative to the API root
    :return: the value of the response's ``Location`` header, or None
        when the response carries no such header
    """
    request_headers = {
        "x-api-key": self.apikey,
        "Accept": "application/pdf",
        "User-Agent": self.user_agent,
    }
    target = urljoin(self.root, url)
    # Redirects are not followed: the redirect target itself is the result.
    reply = requests.get(target, headers=request_headers, allow_redirects=False)
    return reply.headers.get("Location")

@check_response
def geturl(self, url):
headers = {}
headers["Authorization"] = self.apikey
headers["x-api-key"] = self.apikey
headers["Accept"] = "application/json"
headers["User-Agent"] = self.user_agent
self.scraper.info("Api GET next page: %r, %r" % (url, headers))
Expand All @@ -87,8 +119,8 @@ def geturl(self, url):
@check_response
def get_relurl(self, url):
headers = {}
headers["Authorization"] = self.apikey
headers["Accept"] = "application/json"
headers["x-api-key"] = self.apikey
headers["Accept"] = "application/pdf"
headers["User-Agent"] = self.user_agent
url = urljoin(self.root, url)
self.scraper.info("Api GET: %r, %r" % (url, headers))
Expand All @@ -113,7 +145,7 @@ def get(
requests_args = requests_args or ()
requests_kwargs = requests_kwargs or {}
headers = requests_kwargs.get("headers", {})
headers["Authorization"] = self.apikey
headers["x-api-key"] = self.apikey
headers["Accept"] = "application/json"
headers["User-Agent"] = self.user_agent
requests_kwargs["headers"] = headers
Expand Down
Loading

0 comments on commit 22a2018

Please sign in to comment.