IN: change IN API URL due to launch, fix vote PDF fetching
jessemortenson committed Dec 17, 2024
1 parent 4bd1cf4 commit e573fe6
Showing 5 changed files with 72 additions and 12 deletions.
2 changes: 1 addition & 1 deletion scrapers/in/__init__.py
@@ -179,6 +179,6 @@ def get_session_list(self):
             "Accept": "application/json",
             "User-Agent": useragent,
         }
-        resp = requests.get("https://beta-api.iga.in.gov", headers=headers)
+        resp = requests.get("https://api.iga.in.gov", headers=headers)
         resp.raise_for_status()
         return [session["name"] for session in resp.json()["sessions"]]
20 changes: 18 additions & 2 deletions scrapers/in/apiclient.py
@@ -50,10 +50,10 @@ def wrapped(self, *args, **kwargs):
 
 class ApiClient(object):
     """
-    docs: https://docs.beta-api.iga.in.gov
+    docs: https://docs.api.iga.in.gov/
     """
 
-    root = "https://beta-api.iga.in.gov"
+    root = "https://api.iga.in.gov"
     resources = dict(
         sessions="/",
         session="/{session}",
@@ -130,6 +130,22 @@ def get_relurl(self, url):
         self.scraper.info("Api GET: %r, %r" % (url, headers))
         return self.scraper.get(url, headers=headers)
 
+    # fetch an API url where we expect a redirect
+    # return the new redirect URL (do not fetch it yet)
+    def identify_redirect_url(self, url):
+        if self.root not in url:
+            url = urljoin(self.root, url)
+        headers = {}
+        headers["x-api-key"] = self.apikey
+        headers["Accept"] = "application/json"
+        headers["User-Agent"] = self.user_agent
+        response = requests.get(url, headers=headers, allow_redirects=False)
+        if response.status_code in (301, 302):
+            return response.headers["Location"]
+        else:
+            self.scraper.error(f"Failed to get expected redirect URL from {url}")
+            return None
+
     def make_url(self, resource_name, **url_format_args):
         # Build up the url.
         url = self.resources[resource_name]
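The new identify_redirect_url helper above is the heart of the PDF fix: request the ?format=pdf variant with redirects disabled and capture the Location header rather than following it. A minimal standalone sketch of that pattern with plain requests; the API key, rollcall ID, and User-Agent below are placeholders, not values from this commit.

import requests
from urllib.parse import urljoin

root = "https://api.iga.in.gov"
headers = {
    "x-api-key": "YOUR-API-KEY",  # placeholder, not a real credential
    "Accept": "application/json",
    "User-Agent": "example-scraper/1.0",  # placeholder UA
}
# Ask for the PDF variant of a rollcall link without following the redirect,
# so the Location header can be inspected (and fetched later with a
# different User-Agent, as bills.py does below).
resp = requests.get(
    urljoin(root, "/2024/rollcalls/EXAMPLE-ID?format=pdf"),  # hypothetical ID
    headers=headers,
    allow_redirects=False,
)
if resp.status_code in (301, 302):
    print("PDF redirect target:", resp.headers["Location"])
else:
    print("No redirect; status", resp.status_code)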
58 changes: 51 additions & 7 deletions scrapers/in/bills.py
@@ -57,7 +57,7 @@ def _get_bill_url(self, session, bill_id):
 
         return url_template.format(session, url_segment, bill_number)
 
-    def _process_votes(self, rollcalls, bill_id, original_chamber, session):
+    def _process_votes(self, rollcalls, bill_id, original_chamber, session, client):
         result_types = {
             "FAILED": False,
             "DEFEATED": False,
@@ -70,19 +70,49 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session,
         }
 
         for r in rollcalls:
-            proxy_link = PROXY_BASE_URL + r["link"]
+            # each value in rollcalls is an API metadata object describing the rollcall:
+            # it does not include the PDF link explicitly (this can be requested from the "link" url),
+            # but you can add ?format=pdf to the end of the "link" url to synthesize it
+            # {
+            #     "target": "HB1001.03.COMH",
+            #     "chamber": {
+            #         "link": "/2024/chambers/house",
+            #         "name": "House"
+            #     },
+            #     "rollcall_number": "26",
+            #     "results": {
+            #         "yea": 80,
+            #         "nay": 17
+            #     },
+            #     "link": "/2024/rollcalls/{ID_GOES_HERE}",
+            #     "type": "BILL"
+            # }
+            # however the PDF url does not return the PDF content immediately:
+            # it returns a 302 redirect to the actual PDF url,
+            # AND the actual PDF url is sensitive to the incoming User-Agent header
+            vote_url = client.identify_redirect_url(r["link"] + "?format=pdf")
             try:
-                path, _ = self.urlretrieve(proxy_link)
+                headers = {
+                    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0"
+                }
+                path, ret_response = self.urlretrieve(vote_url, headers=headers)
             except scrapelib.HTTPError:
                 self.logger.warning(
-                    "Unable to contact openstates proxy, skipping vote {}".format(
-                        proxy_link
+                    "HTTP error fetching vote URL, skipping vote {}".format(
+                        vote_url
                     )
                 )
                 continue
 
-            text = convert_pdf(path, "text").decode("utf-8")
+            # Looks like a missing PDF file ends up being displayed as "404" content in HTML
+            # instead of the server returning a proper 404,
+            # so sanity check whether the content appears to be HTML instead of PDF
+            if ret_response.headers.get("Content-Type") != "application/pdf":
+                self.logger.warning(f"Got unexpected response type {ret_response.headers.get('Content-Type')},"
+                                    f" skipping {vote_url}")
+                continue
+
+            text = convert_pdf(path, "text").decode("utf-8")
             lines = text.split("\n")
             os.remove(path)

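Putting the two quirks together (the 302 redirect and the HTML "404" body served with a success status), here is a rough sketch of the fetch-and-check flow the hunk above implements via scrapelib's urlretrieve, written with plain requests; the redirect target URL is hypothetical.

import requests

# The redirect target is sensitive to the User-Agent, so present a browser UA
# (this string mirrors the one hard-coded in the diff above).
browser_ua = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) "
    "Gecko/20100101 Firefox/118.0"
)
vote_url = "https://iga.in.gov/pdf-documents/EXAMPLE.pdf"  # hypothetical redirect target
resp = requests.get(vote_url, headers={"User-Agent": browser_ua})
# A missing file may come back as an HTML "404" page with a 200 status,
# so trust the declared Content-Type rather than the status code alone.
if resp.headers.get("Content-Type") != "application/pdf":
    print("Skipping: got", resp.headers.get("Content-Type"), "instead of a PDF")
else:
    print("Fetched", len(resp.content), "bytes of PDF")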
@@ -141,11 +171,24 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session,
                 classification="passage",
             )
 
+            # Historically we've counted yeas/nays/excused/NV from parsing the PDF,
+            # but now the API response provides yea and nay counts:
+            # prefer those counts and log if a difference is found
+            api_yea = int(r["results"]["yea"])
+            api_nay = int(r["results"]["nay"])
+            if yeas != api_yea:
+                self.warning(f"API yea count {api_yea} does not match PDF parse {yeas} "
+                             f"at API {r['link']}, PDF {vote_url}")
+                yeas = api_yea
+            if nays != api_nay:
+                self.warning(f"API nay count {api_nay} does not match PDF parse {nays} "
+                             f"at API {r['link']}, PDF {vote_url}")
+                nays = api_nay
             vote.set_count("yes", yeas)
             vote.set_count("no", nays)
             vote.set_count("excused", excused)
             vote.set_count("not voting", not_voting)
-            vote.add_source(proxy_link)
+            vote.add_source(vote_url)
 
             currently_counting = ""

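The reconciliation above boils down to: trust the API's yea/nay totals, log any disagreement with the PDF parse, then overwrite. A toy illustration with made-up counts:

# Made-up numbers standing in for r["results"] and the PDF tallies.
api_yea, api_nay = 80, 17
pdf_yea, pdf_nay = 79, 17

if pdf_yea != api_yea:
    print(f"yea mismatch: API {api_yea} vs PDF {pdf_yea}; preferring API")
    pdf_yea = api_yea
if pdf_nay != api_nay:
    print(f"nay mismatch: API {api_nay} vs PDF {pdf_nay}; preferring API")
    pdf_nay = api_nay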
@@ -414,6 +457,7 @@ def scrape(self, session=None):
                 disp_bill_id,
                 original_chamber,
                 session,
+                client,
             )
 
         for v in bill_json["versions"]:
2 changes: 1 addition & 1 deletion scrapers/in/committees.py
@@ -54,7 +54,7 @@ def get_subcommittee_info(self, session):
     def scrape(self, session):
         subcomms = self.get_subcommittee_info(session)
 
-        api_base_url = "https://beta-api.iga.in.gov"
+        api_base_url = "https://api.iga.in.gov"
         html_base_url = "http://iga.in.gov/legislative/{}/committees/".format(session)
         client = ApiClient(self)
         r = client.get("committees", session=session)
2 changes: 1 addition & 1 deletion scrapers/in/events.py
@@ -13,7 +13,7 @@
 
 class INEventScraper(Scraper):
     _tz = pytz.timezone("America/Indianapolis")
-    base_url = "https://beta-api.iga.in.gov"
+    base_url = "https://api.iga.in.gov"
     session = date.today().year
 
     def __init__(self, *args, **kwargs):
