From 58b674ac6d6acc68e4f1fbf44bcca38126a315ef Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Fri, 26 Jul 2024 09:58:47 -0700 Subject: [PATCH 1/9] MA: add vote scraper to init --- scrapers/ma/__init__.py | 2 + scrapers/ma/votes.py | 473 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 475 insertions(+) create mode 100644 scrapers/ma/votes.py diff --git a/scrapers/ma/__init__.py b/scrapers/ma/__init__.py index 46773f46f4..d550a21830 100644 --- a/scrapers/ma/__init__.py +++ b/scrapers/ma/__init__.py @@ -4,12 +4,14 @@ from openstates.scrape import State from .bills import MABillScraper from .events import MAEventScraper +from .votes import MAVoteScraper class Massachusetts(State): scrapers = { "bills": MABillScraper, "events": MAEventScraper, + "votes": MAVoteScraper } legislative_sessions = [ { diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py new file mode 100644 index 0000000000..7b5b82dade --- /dev/null +++ b/scrapers/ma/votes.py @@ -0,0 +1,473 @@ +from spatula import ( + URL, + HtmlListPage, + PdfPage, + HtmlPage, + XPath, +) +from openstates.scrape import VoteEvent, Scraper +import re +import pytz +import datetime as dt + + +class VoteTotalMismatch(Exception): + def __init__(self): + super().__init__("Vote total mismatch") + + +class MAVoteScraper(Scraper): + def scrape(self, session=None): + # yield from HouseJournalDirectory().do_scrape() + yield from SenateJournalDirectory().do_scrape() + + +class HouseJournalDirectory(HtmlPage): + source = URL("http://malegislature.gov/Journal/House/192", verify=False) + + def process_page(self): + # find all links to a file called RollCalls + # One of these files exists for each year directory + roll_calls = [ + x for x in XPath("//a/@href").match(self.root) if x.endswith("RollCalls") + ] + for rc in roll_calls: + vote_events = HouseRollCall(source=URL(rc, verify=False)).do_scrape() + for vote_event in vote_events: + vote_event.add_source(self.source.url, note="House journal listing") + yield vote_event + + +class SenateJournalDirectory(HtmlListPage): + source = URL("https://malegislature.gov/Journal/Senate", verify=False) + votes_list = [] + + def process_page(self): + # Find all link to each month + month_links = XPath("//a[@aria-controls='journalList']/@href").match(self.root) + for month_link in month_links: + vote_events = SenateJournalMonth( + source=URL(month_link, verify=False), + votes_list=self.votes_list + ).do_scrape() + for vote_event in vote_events: + # vote_event.add_source(self.source.url, note="Senate jouenal listing") + yield vote_event + + +class SenateJournalMonth(HtmlListPage): + + def __init__(self, source, votes_list): + super().__init__(source=source) + self.votes_list = votes_list + + def process_page(self): + journal_pdf_links = XPath("//tr/td/a/@href").match(self.root) + for journal_pdf_link in journal_pdf_links: + # if journal_pdf_link in ( + # "https://malegislature.gov/Journal/Senate/193/806/sj03302023_1100AM.pdf", + # "https://malegislature.gov/Journal/Senate/193/768/sj03232023_0100PM.pdf", + # ): + yield SenateJournal(source=URL(journal_pdf_link, verify=False), + votes_list=self.votes_list + ) + + +class SenateJournal(PdfPage): + motion_and_vote_total = r"((?P.{900}))\(yeas\s*(?P\d+)\s*(?:-|to)\s*nays.(?P\d+)\)" + vote_id = r"\[Yeas\.?.and.Nays\.?.No\.?.(?P\d+)\]" + vote_section = r"YEAS(?P.*?)(?P\d+)\.(?P.*?)NAYS(?P.*?)(?P\d+)(?:(?P.{0,30}?)ABSENT OR NOT VOTING(?P.*?)(?P\d+))?" + + motion_and_vote_total_re = re.compile(motion_and_vote_total, re.DOTALL) + vote_id_re = re.compile(vote_id, re.DOTALL) + vote_section_re = re.compile(vote_section, re.DOTALL) + + total_vote_re = re.compile(f"{motion_and_vote_total}.*?{vote_id}.*?{vote_section}", re.DOTALL) + + not_name_re = re.compile(r"^(\d|UNCORRECTED|Joint Rules|\.|Real ID,-- homeless" + r"|Pharmacists,-- PrEP\." + r"|Brewster,-- land\." + r"|Provincetown,-wastewater" + r")" + ) + text = None + + precise_motion = r"question on\s+(.+)\s+was determined" + precise_motion_re = re.compile(precise_motion, re.DOTALL) + + bill_id = r"(House|Senate),\s+No\.\s+(\d+)" + bill_id_re = re.compile(bill_id, re.DOTALL) + + motion_classification = { + r"passing.+engross": "engrossment", + r"adoption.+amendment": "amendment-passage", + r"acceptance.+report": "report-acceptance", + r"passing.+enacted": "passage", + r"approving.+plan": "passage", + } + + date_time_re = re.compile(r"sj(\d{8})_(?:\d{3,4}[AM|PM])\.pdf") + + journal_date = None + + def __init__(self, source, votes_list): + super().__init__(source=source) + self.votes_list = votes_list + + def process_date(self): + # Find the match + datetime_match = self.date_time_re.search(self.source.url) + + if datetime_match: + date_str = datetime_match.group(1) + vote_date = dt.datetime.strptime(date_str, "%Y%m%d") + formatted_date = vote_date.strftime("%Y-%m-%d") + + print("Formatted date:", formatted_date) + return formatted_date + + else: + raise Exception(f"Datetime with known format not in pdf url: " + f"{self.source.url}") + + def process_page(self): + vote_date = self.process_date() + + # Remove special characters that look like the - character + self.text = self.text.replace("–", "-").replace("−", "-").replace("—", "-") + + # Search for each of the three components of the larger regex separately. + votes_mt = self.motion_and_vote_total_re.findall(self.text) + votes_s = self.vote_section_re.findall(self.text) + votes_id = self.vote_id_re.findall(self.text) + # Check to make sure they all found the same number of matches + # If they disagree on number of matches, the scraper will not get + # the data correctly so emit a warning and skip this pdf. + if not (len(votes_mt) == len(votes_s) == len(votes_id)): + self.logger.warn(f"\nCould not accurately parse votes for " + f"{self.source.url}\n" + f"len(votes_mt):{len(votes_mt)}\n" + f"len(votes_s):{len(votes_s)}\n" + f"len(votes_id):{len(votes_id)}\n" + f"{self.text}") + else: + # Run full regex search. + votes = self.total_vote_re.finditer(self.text) + votes = [self.parse_match(v) for v in votes] + # yield from + print("\n\n".join([str(x) for x in votes])) + # print(self.source.url) + + def parse_match(self, match): + + # TODO: to get bill_id, it needs to treat each vote separately + # and be able to search backwards in the text for most proximal + # bill_id format match (regex or another approach may be needed) + + # You can ignore the current code that attempts to get a bill_id match, as its + # about to be deprecated by the below described solution. + + """ + The reason why it can't just grab it in the current solution is you would have to go + far back enough into the prior paragraphs to be able to find the bill_id, but not + too far back because then it prevents matches on other votes when vote lines are + positioned too close together in the PDF. This is easier to do for the motion text + than the bill_id + + That is why the motion_and_vote_total regex pattern defined at the top of this class + gets 900 characters preceding each instance of "(yeas ## - nays ##)". It needs to go + far back enough to get the motion text found in between "question on" and "was determined", + but I have had to find a sweet spot where it gets the motion text without preventing matches + on proximally preceding votes. + + Here's the solution for getting the bill_ids which I'm currently working on: + + I have a helper function that uses the raw_motion_text essentially as an indexing point to divide + via a split() of the `self.text` on that substring, and then use a regex that gets the last + instance of a bill_id format in the `self.text`, i.e. bill_id_match = re.findall(pattern, text)[-1] + + This should get us the accurate data because the last bill_id occurrence in the text before + the vote lines is the bill that is being voted upon, in every case I have found. + """ + raw_motion_text = match.group("rawmotion") + bill_id_match = self.bill_id_re.search(raw_motion_text) + if not bill_id_match: + # raise Exception(f"no bill id in {raw_motion_text} at {self.source.url}") + self.logger.warn(f"Could not find bill_id at {self.source.url}") + else: + print("\n\n\n" + bill_id_match.group(2) + "\n\n\n") + + motion_text = self.precise_motion_re.search(raw_motion_text).group(1) + single_line_motion = motion_text.replace("\n", " ") + + normalized_motion = single_line_motion.capitalize() + + vote_classification = None + for pattern, classification in self.motion_classification.items(): + if re.compile(pattern).search(single_line_motion): + vote_classification = classification + break + + if not vote_classification: + raise Exception( + f""" + No vote_classification from {single_line_motion}" in journal at {self.source.url} + """ + ) + + # Get the total counts + first_total_yea = int(match.group("firstyeatotal")) + first_total_nay = int(match.group("firstnaytotal")) + total_yea = int(match.group("secondyeatotal")) + total_nay = int(match.group("secondnaytotal")) + + # # Get non-voting total count, but section may be missing + # possible_total_nv = match.group("secondnvtotal") + # if possible_total_nv is None: + # possible_total_nv = "0" + # total_nv = int(possible_total_nv) + + vote_number = match.group("votenumber") + + # Get list of voter names for each section + yea_voters = self.find_names(match.group("yealines")) + nay_voters = self.find_names(match.group("naylines")) + yea_voters.extend(self.find_names(match.group("extrayealines"))) + + # extre nay lines section may be missing + possible_extra_nay_voters = match.group("extranaylines") + if possible_extra_nay_voters is None: + possible_extra_nay_voters = "" + nay_voters.extend(self.find_names(possible_extra_nay_voters)) + + # non-voting voter name section may be missing + possible_nv_voters = match.group("nvlines") + if possible_nv_voters is None: + possible_nv_voters = "" + nv_voters = self.find_names(possible_nv_voters) + + data = dict( + total_yea=total_yea, + total_nay=total_nay, + yea_voters=yea_voters, + nay_voters=nay_voters, + nv_voters=nv_voters, + vote_number=vote_number, + bill_id_match=bill_id_match.group(2) if bill_id_match else None, + normalized_motion=normalized_motion, + ) + self.votes_list.append(data) + print(self.votes_list) + + # TODO: there are a few regex kinks leading to possible miscounts in the vote + # tallies (not more than 3 votes off) which could be a problem when trying to + # use these counts to determine whether a vote passed/failed. + # But given the typical large vote margins between yeas and nays, even the current + # occassional miscounts would rarely lead to a false determination of the vote result. + yea_mismatch = first_total_yea != total_yea + nay_mismatch = first_total_nay != total_nay + if yea_mismatch or nay_mismatch: + print(self.text) + print(self.source.url) + print(f""" + first_total_yea = {first_total_yea} + total_yea = {total_yea} + first_total_nay = {first_total_nay} + total_nay = {total_nay} + yea_mismatch = {yea_mismatch} + nay_mismatch = {nay_mismatch} + {data} + """) + raise Exception("ynmismatch") + + # Check that total voters and total votes match up + yea_matches_miscount = len(yea_voters) - total_yea + nay_matches_miscount = len(nay_voters) - total_nay + for miscount in yea_matches_miscount, nay_matches_miscount: + # Allows for minor miscount in cases of PDF formatting issues + if abs(miscount) > 1: + print(self.text) + print(self.source.url) + print(f""" + yea_voters = {len(yea_voters)} + total_yea = {total_yea} + nay_voters = {len(nay_voters)} + total_nay = {total_nay} + yea_matches_miscount = {yea_matches_miscount} + nay_matches_miscount = {nay_matches_miscount} + {data} + """) + raise Exception("recorded vote totals differ from logs") + + # TODO: comment back in when bill_id and result handling is complete + # vote = VoteEvent( + # chamber="upper", + # legislative_session="193", + # # start_date=vote_date, + # # motion_text=self.motion, + # bill=bill_id, + # result="pass" if vote_passed else "fail", + # classification="passage", + # ) + + return data + + # Finds names in text, ignoring some common phrases and empty lines + def find_names(self, text): + text = [x.strip() for x in text.split("\n")] + text = [x for x in text if x != ""] + names = [x for x in text if not self.not_name_re.match(x) and "," in x] + + return names + + +class HouseVoteRecordParser: + tz = pytz.timezone("US/Eastern") + total_yea_re = re.compile(r"(\d+) yeas", re.IGNORECASE) + total_nay_re = re.compile(r"(\d+) nays", re.IGNORECASE) + total_nv_re = re.compile(r"(\d+) n/v", re.IGNORECASE) + bill_re = re.compile(r"(h|s)\.? ?(\d+) ?(.*)", re.IGNORECASE) + number_re = re.compile(r"no\.? ?(\d+)", re.IGNORECASE) + + def __init__(self, vote_text): + self.votes = [] + self.names = [] + self.time = None + self.total_yea = None + self.total_nay = None + self.total_nv = None + self.bill_id = None + self.vote_number = None + self.motion = None + self.motion_parts = [] + lines = vote_text.split("\n") + self.raw = vote_text + for line in lines: + self.read_line(line) + + def read_line(self, line): + line = line.strip() + + # These lines contain no useful info and are skipped + blank = line in ["\x0c", ""] + contains_equal = "=" in line + yea_and_nay = line == "Yea and Nay" + if blank or contains_equal or yea_and_nay: + pass + + # Check for vote number. When the vote number is found, we can be sure + # that all the motion text has been read. + elif (match := self.number_re.match(line)) is not None: + self.vote_number = int(match.group(1)) + self.motion = " ".join(self.motion_parts) + + # Check for time + elif ":" in line: + when = datetime.datetime.strptime(line, "%m/%d/%Y %I:%M %p") + when = self.tz.localize(when) + self.time = when + + # Check for vote totals + elif (match := self.total_yea_re.match(line)) is not None: + self.total_yea = int(match.group(1)) + + elif (match := self.total_nay_re.match(line)) is not None: + self.total_nay = int(match.group(1)) + + elif (match := self.total_nv_re.match(line)) is not None: + self.total_nv = int(match.group(1)) + + # line is vote type + # Y is sometimes read as P by the pdf reader. + elif line in ["Y", "N", "X", "P"]: + self.votes.append(line) + + # Read the line as motion, motion text may come through as multiple + # lines so append the line to an array. + elif self.vote_number is None: + self.motion_parts.append(line) + + # At this point, the line is assumed to contain a name. + + # Special case where pdf reader mistakenly joins two names together into + # a single line. This can happen if the first name starts with a double + # '--'. This can cause the next name in the list to be joined with + # this line. e.g. "--Jones-Smith" instead of "--Jones--" and "Smith" on + # separate lines. + elif line.startswith("--"): + all_names = [x for x in line[2:].split("-") if x] + self.names.extend(all_names) + + # The line is a single name, but may be surrounded by '--' + else: + self.names.append(line.replace("--", "")) + + # Raises an error or writes warning to logger. Returns true if data is valid + def error_if_invalid(self): + votes_match_names = len(self.names) == len(self.votes) + if not votes_match_names: + raise VoteTotalMismatch() + + def get_warning(self): + # Some votes may not have any motion listed + if not self.motion: + return ( + f"Found vote with no motion listed, skipping vote #{self.vote_number}" + ) + + def createVoteEvent(self): + vote_passed = self.total_yea > self.total_nay + + # Check for bill id in motion text + bill_id = None + if (match := self.bill_re.match(self.motion)) is not None: + bill_id = f"{match.group(1)}{match.group(2)}" + + vote = VoteEvent( + chamber="lower", + legislative_session="193", + start_date=self.time, + motion_text=self.motion, + bill=bill_id, + result="pass" if vote_passed else "fail", + classification="passage", + ) + + vote.set_count("yes", self.total_yea) + vote.set_count("no", self.total_nay) + + vote_dictionary = { + "Y": "yes", + "P": "yes", # Y's can be misread as P's + "N": "no", + "X": "not voting", + } + + # Add all individual votes + for name, vote_val in zip(self.names, self.votes): + vote.vote(vote_dictionary[vote_val], name) + return vote + + +class HouseRollCall(PdfPage): + def process_page(self): + # Each bill vote starts with the same text, so use it as a separator. + separator = "MASSACHUSETTS HOUSE OF REPRESENTATIVES" + + # Ignore first element after split because it's going to be blank + vote_text = self.text.split(separator)[1:] + + for vote in vote_text: + vote_parser = HouseVoteRecordParser(vote) + if (warning := vote_parser.get_warning()) is not None: + self.logger.warn(warning) + else: + vote_parser.error_if_invalid() + vote_event = vote_parser.createVoteEvent() + vote_event.add_source(self.source.url, note="Vote record pdf") + yield vote_event + + +""" + +""" From 0495f25a37c2fc1ba1fb4b6d1149abc0bdc55dd4 Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Fri, 26 Jul 2024 09:59:56 -0700 Subject: [PATCH 2/9] MA: create initial vote scraper --- scrapers/ma/votes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py index 7b5b82dade..3815b22115 100644 --- a/scrapers/ma/votes.py +++ b/scrapers/ma/votes.py @@ -106,7 +106,7 @@ class SenateJournal(PdfPage): r"approving.+plan": "passage", } - date_time_re = re.compile(r"sj(\d{8})_(?:\d{3,4}[AM|PM])\.pdf") + date_time_re = re.compile(r"sj(\d{8})_(?:\d{3,4}(?:AM|PM))\.pdf") journal_date = None From 049fe18f3c530fefb70347541944c4a986d90b74 Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Fri, 26 Jul 2024 10:13:38 -0700 Subject: [PATCH 3/9] MA: vote data regex fix --- scrapers/ma/votes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py index 3815b22115..73c013d396 100644 --- a/scrapers/ma/votes.py +++ b/scrapers/ma/votes.py @@ -106,7 +106,7 @@ class SenateJournal(PdfPage): r"approving.+plan": "passage", } - date_time_re = re.compile(r"sj(\d{8})_(?:\d{3,4}(?:AM|PM))\.pdf") + date_time_re = re.compile(r"sj(\d{8})_") journal_date = None @@ -120,7 +120,7 @@ def process_date(self): if datetime_match: date_str = datetime_match.group(1) - vote_date = dt.datetime.strptime(date_str, "%Y%m%d") + vote_date = dt.datetime.strptime(date_str, "%m%d%Y") formatted_date = vote_date.strftime("%Y-%m-%d") print("Formatted date:", formatted_date) From e6d588041b848438e68d50d2b23c7f587b61eb5f Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Fri, 26 Jul 2024 10:49:49 -0700 Subject: [PATCH 4/9] MA: datetime to dt --- scrapers/ma/votes.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py index 73c013d396..9016f7158c 100644 --- a/scrapers/ma/votes.py +++ b/scrapers/ma/votes.py @@ -90,7 +90,6 @@ class SenateJournal(PdfPage): r"|Provincetown,-wastewater" r")" ) - text = None precise_motion = r"question on\s+(.+)\s+was determined" precise_motion_re = re.compile(precise_motion, re.DOTALL) @@ -108,7 +107,9 @@ class SenateJournal(PdfPage): date_time_re = re.compile(r"sj(\d{8})_") + text = None journal_date = None + bill_id = None def __init__(self, source, votes_list): super().__init__(source=source) @@ -320,6 +321,10 @@ def find_names(self, text): return names + def get_bill_id(self, motion_text): + + + class HouseVoteRecordParser: tz = pytz.timezone("US/Eastern") @@ -363,7 +368,7 @@ def read_line(self, line): # Check for time elif ":" in line: - when = datetime.datetime.strptime(line, "%m/%d/%Y %I:%M %p") + when = dt.datetime.strptime(line, "%m/%d/%Y %I:%M %p") when = self.tz.localize(when) self.time = when From a3715c038abb1a0ae5a6e4a813a94c04f9fba0fe Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Fri, 26 Jul 2024 10:55:22 -0700 Subject: [PATCH 5/9] MA: reformat bill_id func --- scrapers/ma/votes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py index 9016f7158c..6d499e092b 100644 --- a/scrapers/ma/votes.py +++ b/scrapers/ma/votes.py @@ -321,10 +321,6 @@ def find_names(self, text): return names - def get_bill_id(self, motion_text): - - - class HouseVoteRecordParser: tz = pytz.timezone("US/Eastern") From 9db7e17f9b2b33dccd78a6ce97da6caadfa83ea6 Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Fri, 26 Jul 2024 13:19:34 -0700 Subject: [PATCH 6/9] MA: add vote_event instantiation --- scrapers/ma/votes.py | 174 ++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 75 deletions(-) diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py index 6d499e092b..aa1327f674 100644 --- a/scrapers/ma/votes.py +++ b/scrapers/ma/votes.py @@ -99,10 +99,12 @@ class SenateJournal(PdfPage): motion_classification = { r"passing.+engross": "engrossment", - r"adoption.+amendment": "amendment-passage", + r"adoption.+amendment": "amendment-adoption", r"acceptance.+report": "report-acceptance", r"passing.+enacted": "passage", r"approving.+plan": "passage", + r"suspension.+Rule": "rule-suspension", + r"adoption.+motion": "motion-adoption", } date_time_re = re.compile(r"sj(\d{8})_") @@ -110,6 +112,7 @@ class SenateJournal(PdfPage): text = None journal_date = None bill_id = None + vote_date = None def __init__(self, source, votes_list): super().__init__(source=source) @@ -124,7 +127,6 @@ def process_date(self): vote_date = dt.datetime.strptime(date_str, "%m%d%Y") formatted_date = vote_date.strftime("%Y-%m-%d") - print("Formatted date:", formatted_date) return formatted_date else: @@ -132,7 +134,7 @@ def process_date(self): f"{self.source.url}") def process_page(self): - vote_date = self.process_date() + self.vote_date = self.process_date() # Remove special characters that look like the - character self.text = self.text.replace("–", "-").replace("−", "-").replace("—", "-") @@ -149,17 +151,26 @@ def process_page(self): f"{self.source.url}\n" f"len(votes_mt):{len(votes_mt)}\n" f"len(votes_s):{len(votes_s)}\n" - f"len(votes_id):{len(votes_id)}\n" - f"{self.text}") + f"len(votes_id):{len(votes_id)}\n") else: # Run full regex search. - votes = self.total_vote_re.finditer(self.text) - votes = [self.parse_match(v) for v in votes] - # yield from - print("\n\n".join([str(x) for x in votes])) - # print(self.source.url) + vote_matches = self.total_vote_re.finditer(self.text) + votes_data_list = [] + + i = 0 + for v_match in vote_matches: + vote = self.parse_match(v_match, i) + votes_data_list.append(vote) + i += 1 + yield vote + + print("\n\n".join([str(x) for x in votes_data_list])) - def parse_match(self, match): + def parse_match(self, match, index): + bill_id = self.get_bill_id(index) + if not bill_id: + self.logger.warn(f"No valid bill id found preceding vote lines in {self.source.url}") + return {} # TODO: to get bill_id, it needs to treat each vote separately # and be able to search backwards in the text for most proximal @@ -191,17 +202,12 @@ def parse_match(self, match): the vote lines is the bill that is being voted upon, in every case I have found. """ raw_motion_text = match.group("rawmotion") - bill_id_match = self.bill_id_re.search(raw_motion_text) - if not bill_id_match: - # raise Exception(f"no bill id in {raw_motion_text} at {self.source.url}") - self.logger.warn(f"Could not find bill_id at {self.source.url}") - else: - print("\n\n\n" + bill_id_match.group(2) + "\n\n\n") motion_text = self.precise_motion_re.search(raw_motion_text).group(1) single_line_motion = motion_text.replace("\n", " ") normalized_motion = single_line_motion.capitalize() + print(normalized_motion) vote_classification = None for pattern, classification in self.motion_classification.items(): @@ -247,71 +253,76 @@ def parse_match(self, match): possible_nv_voters = "" nv_voters = self.find_names(possible_nv_voters) - data = dict( - total_yea=total_yea, - total_nay=total_nay, - yea_voters=yea_voters, - nay_voters=nay_voters, - nv_voters=nv_voters, - vote_number=vote_number, - bill_id_match=bill_id_match.group(2) if bill_id_match else None, - normalized_motion=normalized_motion, - ) - self.votes_list.append(data) - print(self.votes_list) - - # TODO: there are a few regex kinks leading to possible miscounts in the vote - # tallies (not more than 3 votes off) which could be a problem when trying to - # use these counts to determine whether a vote passed/failed. - # But given the typical large vote margins between yeas and nays, even the current - # occassional miscounts would rarely lead to a false determination of the vote result. + # # To help flag certain high priority console logging during debugging + # red_color = '\033[91m' + # reset_color = '\033[0m' + + first_margin = first_total_yea - first_total_nay + final_margin = total_yea - total_nay + abs_first, abs_final = abs(first_margin), abs(final_margin) + if abs_first < abs_final: + determinative_margin = abs_first + vote_passed = True if first_margin > 0 else False + else: + determinative_margin = abs_final + vote_passed = True if final_margin > 0 else False + yea_mismatch = first_total_yea != total_yea nay_mismatch = first_total_nay != total_nay if yea_mismatch or nay_mismatch: - print(self.text) - print(self.source.url) - print(f""" - first_total_yea = {first_total_yea} - total_yea = {total_yea} - first_total_nay = {first_total_nay} - total_nay = {total_nay} - yea_mismatch = {yea_mismatch} - nay_mismatch = {nay_mismatch} - {data} - """) - raise Exception("ynmismatch") - - # Check that total voters and total votes match up + self.logger.warn(f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}") + return {} + # print(self.source.url) + # print(f""" + # first_total_yea = {first_total_yea} + # total_yea = {total_yea} + # first_total_nay = {first_total_nay} + # total_nay = {total_nay} + # yea_mismatch = {yea_mismatch} + # nay_mismatch = {nay_mismatch} + # {data} + # """) + # + # if yea_mismatch: + # if abs(first_total_yea - total_yea) > determinative_margin: + # print(f"{red_color}YEA MISMATCH GREATER THAN DETERMINATIVE MARGIN{reset_color}") + # if nay_mismatch: + # if abs(first_total_nay - total_nay) > determinative_margin: + # print(f"{red_color}NAY MISMATCH GREATER THAN DETERMINATIVE MARGIN{reset_color}") + # + # # Check that total voters and total votes match up yea_matches_miscount = len(yea_voters) - total_yea nay_matches_miscount = len(nay_voters) - total_nay for miscount in yea_matches_miscount, nay_matches_miscount: # Allows for minor miscount in cases of PDF formatting issues - if abs(miscount) > 1: - print(self.text) - print(self.source.url) - print(f""" - yea_voters = {len(yea_voters)} - total_yea = {total_yea} - nay_voters = {len(nay_voters)} - total_nay = {total_nay} - yea_matches_miscount = {yea_matches_miscount} - nay_matches_miscount = {nay_matches_miscount} - {data} - """) - raise Exception("recorded vote totals differ from logs") - - # TODO: comment back in when bill_id and result handling is complete - # vote = VoteEvent( - # chamber="upper", - # legislative_session="193", - # # start_date=vote_date, - # # motion_text=self.motion, - # bill=bill_id, - # result="pass" if vote_passed else "fail", - # classification="passage", - # ) - - return data + if abs(miscount) > determinative_margin: + self.logger.warn( + f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}") + return {} + # print(f"""{red_color} + # MISCOUNT (i.e. total voters and total votes don't match in big way!) + # yea_voters = {len(yea_voters)} + # total_yea = {total_yea} + # nay_voters = {len(nay_voters)} + # total_nay = {total_nay} + # yea_matches_miscount = {yea_matches_miscount} + # nay_matches_miscount = {nay_matches_miscount} + # {data} + # {reset_color}""") + + vote_event = VoteEvent( + chamber="upper", + legislative_session="193", + start_date=self.vote_date, + motion_text=normalized_motion, + bill=bill_id, + result="pass" if vote_passed else "fail", + classification=vote_classification, + ) + + vote_event.add_source(self.source.url) + + return vote_event # Finds names in text, ignoring some common phrases and empty lines def find_names(self, text): @@ -321,6 +332,19 @@ def find_names(self, text): return names + def get_bill_id(self, index): + pre_vote_sections = self.text.split("(yeas")[:-1] + relevant_section = pre_vote_sections[index] + bill_id_match = re.findall(self.bill_id_re, relevant_section) + if bill_id_match: + chamber, number = bill_id_match[-1] + self.bill_id = f"{chamber[0]} {number}" + if self.bill_id: + print(f"BILL ID MATCH: {self.bill_id}") + else: + self.logger.warn(f"No preceding bill id for vote {index + 1} in {self.source.url}") + return self.bill_id + class HouseVoteRecordParser: tz = pytz.timezone("US/Eastern") From 036d7e0e6b8484ce2d1e67e611eb8f326605b462 Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Fri, 26 Jul 2024 16:18:44 -0700 Subject: [PATCH 7/9] MA: simplied vote classifications --- scrapers/ma/votes.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py index aa1327f674..ba59c7abd8 100644 --- a/scrapers/ma/votes.py +++ b/scrapers/ma/votes.py @@ -98,13 +98,10 @@ class SenateJournal(PdfPage): bill_id_re = re.compile(bill_id, re.DOTALL) motion_classification = { - r"passing.+engross": "engrossment", - r"adoption.+amendment": "amendment-adoption", - r"acceptance.+report": "report-acceptance", + r"passing.+engross": "passage", + r"adoption.+amendment": "amendment", r"passing.+enacted": "passage", r"approving.+plan": "passage", - r"suspension.+Rule": "rule-suspension", - r"adoption.+motion": "motion-adoption", } date_time_re = re.compile(r"sj(\d{8})_") @@ -216,7 +213,7 @@ def parse_match(self, match, index): break if not vote_classification: - raise Exception( + self.logger.warn( f""" No vote_classification from {single_line_motion}" in journal at {self.source.url} """ From cadff7a79dc7db2172da7ee89f30903aebbf649a Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Mon, 29 Jul 2024 08:59:23 -0700 Subject: [PATCH 8/9] MA: removed unnecessary prints, comments, unused vars --- scrapers/ma/votes.py | 48 ++++++++------------------------------------ 1 file changed, 8 insertions(+), 40 deletions(-) diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py index ba59c7abd8..f023a50edb 100644 --- a/scrapers/ma/votes.py +++ b/scrapers/ma/votes.py @@ -161,8 +161,6 @@ def process_page(self): i += 1 yield vote - print("\n\n".join([str(x) for x in votes_data_list])) - def parse_match(self, match, index): bill_id = self.get_bill_id(index) if not bill_id: @@ -204,7 +202,6 @@ def parse_match(self, match, index): single_line_motion = motion_text.replace("\n", " ") normalized_motion = single_line_motion.capitalize() - print(normalized_motion) vote_classification = None for pattern, classification in self.motion_classification.items(): @@ -231,7 +228,7 @@ def parse_match(self, match, index): # possible_total_nv = "0" # total_nv = int(possible_total_nv) - vote_number = match.group("votenumber") + # vote_number = match.group("votenumber") # Get list of voter names for each section yea_voters = self.find_names(match.group("yealines")) @@ -244,11 +241,11 @@ def parse_match(self, match, index): possible_extra_nay_voters = "" nay_voters.extend(self.find_names(possible_extra_nay_voters)) - # non-voting voter name section may be missing - possible_nv_voters = match.group("nvlines") - if possible_nv_voters is None: - possible_nv_voters = "" - nv_voters = self.find_names(possible_nv_voters) + # # non-voting voter name section may be missing + # possible_nv_voters = match.group("nvlines") + # if possible_nv_voters is None: + # possible_nv_voters = "" + # nv_voters = self.find_names(possible_nv_voters) # # To help flag certain high priority console logging during debugging # red_color = '\033[91m' @@ -269,24 +266,7 @@ def parse_match(self, match, index): if yea_mismatch or nay_mismatch: self.logger.warn(f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}") return {} - # print(self.source.url) - # print(f""" - # first_total_yea = {first_total_yea} - # total_yea = {total_yea} - # first_total_nay = {first_total_nay} - # total_nay = {total_nay} - # yea_mismatch = {yea_mismatch} - # nay_mismatch = {nay_mismatch} - # {data} - # """) - # - # if yea_mismatch: - # if abs(first_total_yea - total_yea) > determinative_margin: - # print(f"{red_color}YEA MISMATCH GREATER THAN DETERMINATIVE MARGIN{reset_color}") - # if nay_mismatch: - # if abs(first_total_nay - total_nay) > determinative_margin: - # print(f"{red_color}NAY MISMATCH GREATER THAN DETERMINATIVE MARGIN{reset_color}") - # + # # Check that total voters and total votes match up yea_matches_miscount = len(yea_voters) - total_yea nay_matches_miscount = len(nay_voters) - total_nay @@ -296,16 +276,6 @@ def parse_match(self, match, index): self.logger.warn( f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}") return {} - # print(f"""{red_color} - # MISCOUNT (i.e. total voters and total votes don't match in big way!) - # yea_voters = {len(yea_voters)} - # total_yea = {total_yea} - # nay_voters = {len(nay_voters)} - # total_nay = {total_nay} - # yea_matches_miscount = {yea_matches_miscount} - # nay_matches_miscount = {nay_matches_miscount} - # {data} - # {reset_color}""") vote_event = VoteEvent( chamber="upper", @@ -336,9 +306,7 @@ def get_bill_id(self, index): if bill_id_match: chamber, number = bill_id_match[-1] self.bill_id = f"{chamber[0]} {number}" - if self.bill_id: - print(f"BILL ID MATCH: {self.bill_id}") - else: + if not self.bill_id: self.logger.warn(f"No preceding bill id for vote {index + 1} in {self.source.url}") return self.bill_id From b274fcb226c1862037012ccd3a271aede915bb39 Mon Sep 17 00:00:00 2001 From: Christopher Yamas Date: Mon, 29 Jul 2024 10:28:48 -0700 Subject: [PATCH 9/9] MA: votes file lint fixes --- scrapers/ma/__init__.py | 2 +- scrapers/ma/votes.py | 64 ++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/scrapers/ma/__init__.py b/scrapers/ma/__init__.py index d550a21830..15dd76c398 100644 --- a/scrapers/ma/__init__.py +++ b/scrapers/ma/__init__.py @@ -11,7 +11,7 @@ class Massachusetts(State): scrapers = { "bills": MABillScraper, "events": MAEventScraper, - "votes": MAVoteScraper + "votes": MAVoteScraper, } legislative_sessions = [ { diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py index f023a50edb..05dec796c5 100644 --- a/scrapers/ma/votes.py +++ b/scrapers/ma/votes.py @@ -47,8 +47,7 @@ def process_page(self): month_links = XPath("//a[@aria-controls='journalList']/@href").match(self.root) for month_link in month_links: vote_events = SenateJournalMonth( - source=URL(month_link, verify=False), - votes_list=self.votes_list + source=URL(month_link, verify=False), votes_list=self.votes_list ).do_scrape() for vote_event in vote_events: # vote_event.add_source(self.source.url, note="Senate jouenal listing") @@ -56,7 +55,6 @@ def process_page(self): class SenateJournalMonth(HtmlListPage): - def __init__(self, source, votes_list): super().__init__(source=source) self.votes_list = votes_list @@ -68,9 +66,9 @@ def process_page(self): # "https://malegislature.gov/Journal/Senate/193/806/sj03302023_1100AM.pdf", # "https://malegislature.gov/Journal/Senate/193/768/sj03232023_0100PM.pdf", # ): - yield SenateJournal(source=URL(journal_pdf_link, verify=False), - votes_list=self.votes_list - ) + yield SenateJournal( + source=URL(journal_pdf_link, verify=False), votes_list=self.votes_list + ) class SenateJournal(PdfPage): @@ -82,14 +80,17 @@ class SenateJournal(PdfPage): vote_id_re = re.compile(vote_id, re.DOTALL) vote_section_re = re.compile(vote_section, re.DOTALL) - total_vote_re = re.compile(f"{motion_and_vote_total}.*?{vote_id}.*?{vote_section}", re.DOTALL) + total_vote_re = re.compile( + f"{motion_and_vote_total}.*?{vote_id}.*?{vote_section}", re.DOTALL + ) - not_name_re = re.compile(r"^(\d|UNCORRECTED|Joint Rules|\.|Real ID,-- homeless" - r"|Pharmacists,-- PrEP\." - r"|Brewster,-- land\." - r"|Provincetown,-wastewater" - r")" - ) + not_name_re = re.compile( + r"^(\d|UNCORRECTED|Joint Rules|\.|Real ID,-- homeless" + r"|Pharmacists,-- PrEP\." + r"|Brewster,-- land\." + r"|Provincetown,-wastewater" + r")" + ) precise_motion = r"question on\s+(.+)\s+was determined" precise_motion_re = re.compile(precise_motion, re.DOTALL) @@ -127,8 +128,9 @@ def process_date(self): return formatted_date else: - raise Exception(f"Datetime with known format not in pdf url: " - f"{self.source.url}") + raise Exception( + f"Datetime with known format not in pdf url: {self.source.url}" + ) def process_page(self): self.vote_date = self.process_date() @@ -144,11 +146,13 @@ def process_page(self): # If they disagree on number of matches, the scraper will not get # the data correctly so emit a warning and skip this pdf. if not (len(votes_mt) == len(votes_s) == len(votes_id)): - self.logger.warn(f"\nCould not accurately parse votes for " - f"{self.source.url}\n" - f"len(votes_mt):{len(votes_mt)}\n" - f"len(votes_s):{len(votes_s)}\n" - f"len(votes_id):{len(votes_id)}\n") + self.logger.warn( + f"\nCould not accurately parse votes for " + f"{self.source.url}\n" + f"len(votes_mt):{len(votes_mt)}\n" + f"len(votes_s):{len(votes_s)}\n" + f"len(votes_id):{len(votes_id)}\n" + ) else: # Run full regex search. vote_matches = self.total_vote_re.finditer(self.text) @@ -164,7 +168,9 @@ def process_page(self): def parse_match(self, match, index): bill_id = self.get_bill_id(index) if not bill_id: - self.logger.warn(f"No valid bill id found preceding vote lines in {self.source.url}") + self.logger.warn( + f"No valid bill id found preceding vote lines in {self.source.url}" + ) return {} # TODO: to get bill_id, it needs to treat each vote separately @@ -264,7 +270,9 @@ def parse_match(self, match, index): yea_mismatch = first_total_yea != total_yea nay_mismatch = first_total_nay != total_nay if yea_mismatch or nay_mismatch: - self.logger.warn(f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}") + self.logger.warn( + f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}" + ) return {} # # Check that total voters and total votes match up @@ -274,7 +282,8 @@ def parse_match(self, match, index): # Allows for minor miscount in cases of PDF formatting issues if abs(miscount) > determinative_margin: self.logger.warn( - f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}") + f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}" + ) return {} vote_event = VoteEvent( @@ -307,7 +316,9 @@ def get_bill_id(self, index): chamber, number = bill_id_match[-1] self.bill_id = f"{chamber[0]} {number}" if not self.bill_id: - self.logger.warn(f"No preceding bill id for vote {index + 1} in {self.source.url}") + self.logger.warn( + f"No preceding bill id for vote {index + 1} in {self.source.url}" + ) return self.bill_id @@ -456,8 +467,3 @@ def process_page(self): vote_event = vote_parser.createVoteEvent() vote_event.add_source(self.source.url, note="Vote record pdf") yield vote_event - - -""" - -"""