Merge pull request #4917 from braykuka/1185-il-votes-scraper-fragility
IL: Vote Scraper Fragility
NewAgeAirbender authored Apr 12, 2024
2 parents b903da1 + 0fd753c commit 444e5c7
Showing 1 changed file with 31 additions and 49 deletions.
scrapers/il/bills.py: 31 additions & 49 deletions
@@ -7,11 +7,10 @@
 import lxml.html
 from openstates.scrape import Scraper, Bill, VoteEvent
 from openstates.utils import convert_pdf
+from ._utils import canonicalize_url

 central = pytz.timezone("US/Central")

-# from ._utils import canonicalize_url
-

 session_details = {
     "103rd": {
@@ -504,7 +503,7 @@ def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
         sponsor_list = build_sponsor_list(doc.xpath('//a[contains(@class, "content")]'))
         # don't add just yet; we can make them better using action data

-        # committee_actors = {}
+        committee_actors = {}

         # actions
         action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
@@ -520,15 +519,18 @@ def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
             action = action_elem.text_content()
             classification, related_orgs = _categorize_action(action)

-            # if related_orgs and any(c.startswith("committee") for c in classification):
-            #     ((name, source),) = [
-            #         (a.text, a.get("href"))
-            #         for a in action_elem.xpath("a")
-            #         if "committee" in a.get("href")
-            #     ]
-            #     source = canonicalize_url(source)
-            #     actor_id = {"sources__url": source, "classification": "committee"}
-            #     committee_actors[source] = name
+            if related_orgs and any(c.startswith("committee") for c in classification):
+                try:
+                    ((name, source),) = [
+                        (a.text, a.get("href"))
+                        for a in action_elem.xpath("a")
+                        if "committee" in a.get("href")
+                    ]
+                    source = canonicalize_url(source)
+                    actor_id = {"sources__url": source, "classification": "committee"}
+                    committee_actors[source] = name
+                except ValueError:
+                    self.warning("Can't resolve voting body for %s" % classification)

             bill.add_action(
                 action,
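Note: the restored block leans on Python's single-element tuple unpacking, ((name, source),) = [...], which succeeds only when the list comprehension yields exactly one committee link; zero or multiple matches raise ValueError, which the new try/except downgrades to a warning instead of crashing the scrape. A minimal standalone sketch of that pattern (the link data below is invented for illustration):

links = [("Judiciary Committee", "/committee?ID=123")]
((name, source),) = links  # exactly one item: unpacking succeeds

for bad in ([], [("A", "/a"), ("B", "/b")]):
    try:
        ((name, source),) = bad
    except ValueError as exc:
        # zero or multiple committee links: warn and move on, don't crash
        print("can't resolve voting body:", exc)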
Expand Down Expand Up @@ -560,8 +562,8 @@ def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
yield bill

# temporarily remove vote processing due to pdf issues
# votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
# yield from self.scrape_votes(session, bill, votes_url, committee_actors)
votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
yield from self.scrape_votes(session, bill, votes_url, committee_actors)

def scrape_documents(self, bill, version_url):
html = self.get(version_url).text
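Note: with the call restored, scrape_bill again delegates to the scrape_votes generator, so each VoteEvent is re-yielded alongside the Bill. A toy sketch of the yield-from delegation (the names below are stand-ins, not the scraper's real objects):

def fetch_votes(bill_id):
    # stand-in for scrape_votes, which yields VoteEvent objects
    for motion in ("Third Reading", "Concurrence"):
        yield "VoteEvent(%s, %s)" % (bill_id, motion)

def fetch_bill(bill_id):
    yield "Bill(%s)" % bill_id
    # yield from re-yields everything the inner generator produces
    yield from fetch_votes(bill_id)

print(list(fetch_bill("HB 1")))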
Expand Down Expand Up @@ -633,7 +635,6 @@ def scrape_votes(self, session, bill, votes_url, committee_actors):
doc.make_links_absolute(votes_url)

for link in doc.xpath('//a[contains(@href, "votehistory")]'):

if link.get("href") in DUPE_VOTES:
continue

Expand Down Expand Up @@ -747,6 +748,7 @@ def scrape_pdf_for_votes(self, session, actor, date, motion, href):
passed = None
counts_found = False
vote_lines = []

for line in pdflines:
# consider pass/fail as a document property instead of a result of the vote count
# extract the vote count from the document instead of just using counts of names
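Note: the two comments describe the approach taken here: trust the tallies printed in the PDF itself rather than inferring pass/fail by counting parsed names. A rough sketch of that idea, assuming a tally line shaped like "59 YEAS 0 NAYS 2 PRESENT" (an assumed format; the real documents may differ):

import re

# Assumed tally-line shape, for illustration only.
COUNT_RE = re.compile(r"(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT", re.I)

def extract_counts(pdflines):
    for line in pdflines:
        match = COUNT_RE.search(line)
        if match:
            return tuple(int(g) for g in match.groups())
    return None  # no tally line found; counts_found stays False

print(extract_counts(["HOUSE ROLL CALL", "59 YEAS 0 NAYS 2 PRESENT"]))  # (59, 0, 2)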
Expand Down Expand Up @@ -897,46 +899,26 @@ def refine_sponsor_list(self, chamber, action, sponsor_list, bill_id):


def find_columns_and_parse(vote_lines):
columns = find_columns(vote_lines)
votes = {}
for line in vote_lines:
for idx in reversed(columns):
bit = line[idx:]
line = line[:idx]
if bit:
vote, name = bit.split(" ", 1)
votes[name.strip()] = vote
return votes

for line in vote_lines:
for name, vote in correct_line(line.strip()):
votes[name] = vote

def _is_potential_column(line, i):
for val in VOTE_VALUES:
if re.search(r"^%s\s{2,10}(\w.).*" % val, line[i:]):
return True
return False
return votes


def find_columns(vote_lines):
potential_columns = []
def correct_line(line):
data = []

for line in vote_lines:
pcols = set()
for i, x in enumerate(line):
if _is_potential_column(line, i):
pcols.add(i)
potential_columns.append(pcols)

starter = potential_columns[0]
for pc in potential_columns[1:-1]:
starter.intersection_update(pc)
last_row_cols = potential_columns[-1]
if not last_row_cols.issubset(starter):
raise Exception(
"Row's columns [%s] don't align with candidate final columns [%s]: %s"
% (last_row_cols, starter, line)
)
# we should now only have values that appeared in every line
return sorted(starter)
name = ""
for word in reversed(line.split()):
if name and word in VOTE_VALUES:
data.append((name, word))
name = ""
continue
name = " ".join([word, name]).strip()
return data


def build_sponsor_list(sponsor_atags):
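Note: the rewrite drops the column-position heuristics entirely. correct_line walks each PDF line's words right to left, accumulating words into a name until it hits a vote marker, so ragged column spacing no longer matters. A quick illustration (VOTE_VALUES here is an assumed stand-in for the scraper's real constant):

VOTE_VALUES = ["Y", "N", "NV", "E", "P", "A"]  # assumed contents, for illustration

def correct_line(line):
    data = []
    name = ""
    for word in reversed(line.split()):
        if name and word in VOTE_VALUES:
            data.append((name, word))
            name = ""
            continue
        name = " ".join([word, name]).strip()
    return data

# Three name/vote pairs on one line with irregular spacing:
print(correct_line("Y  Smith, John   N  Van Pelt    NV Jones"))
# [('Jones', 'NV'), ('Van Pelt', 'N'), ('Smith, John', 'Y')]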
