From 58b674ac6d6acc68e4f1fbf44bcca38126a315ef Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Fri, 26 Jul 2024 09:58:47 -0700
Subject: [PATCH 1/9] MA: add initial vote scraper and register it in init

---
 scrapers/ma/__init__.py |   2 +
 scrapers/ma/votes.py    | 473 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 475 insertions(+)
 create mode 100644 scrapers/ma/votes.py
diff --git a/scrapers/ma/__init__.py b/scrapers/ma/__init__.py
index 46773f46f4..d550a21830 100644
--- a/scrapers/ma/__init__.py
+++ b/scrapers/ma/__init__.py
@@ -4,12 +4,14 @@
 from openstates.scrape import State
 from .bills import MABillScraper
 from .events import MAEventScraper
+from .votes import MAVoteScraper
 
 
 class Massachusetts(State):
     scrapers = {
         "bills": MABillScraper,
         "events": MAEventScraper,
+        "votes": MAVoteScraper
     }
     legislative_sessions = [
         {
diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
new file mode 100644
index 0000000000..7b5b82dade
--- /dev/null
+++ b/scrapers/ma/votes.py
@@ -0,0 +1,473 @@
+from spatula import (
+    URL,
+    HtmlListPage,
+    PdfPage,
+    HtmlPage,
+    XPath,
+)
+from openstates.scrape import VoteEvent, Scraper
+import re
+import pytz
+import datetime as dt
+
+
+class VoteTotalMismatch(Exception):
+    def __init__(self):
+        super().__init__("Vote total mismatch")
+
+
+class MAVoteScraper(Scraper):
+    def scrape(self, session=None):
+        # yield from HouseJournalDirectory().do_scrape()
+        yield from SenateJournalDirectory().do_scrape()
+
+
+class HouseJournalDirectory(HtmlPage):
+    source = URL("http://malegislature.gov/Journal/House/192", verify=False)
+
+    def process_page(self):
+        # find all links to a file called RollCalls
+        # One of these files exists for each year directory
+        roll_calls = [
+            x for x in XPath("//a/@href").match(self.root) if x.endswith("RollCalls")
+        ]
+        for rc in roll_calls:
+            vote_events = HouseRollCall(source=URL(rc, verify=False)).do_scrape()
+            for vote_event in vote_events:
+                vote_event.add_source(self.source.url, note="House journal listing")
+                yield vote_event
+
+
+class SenateJournalDirectory(HtmlListPage):
+    source = URL("https://malegislature.gov/Journal/Senate", verify=False)
+    votes_list = []
+
+    def process_page(self):
+        # Find all link to each month
+        month_links = XPath("//a[@aria-controls='journalList']/@href").match(self.root)
+        for month_link in month_links:
+            vote_events = SenateJournalMonth(
+                source=URL(month_link, verify=False),
+                votes_list=self.votes_list
+            ).do_scrape()
+            for vote_event in vote_events:
+                # vote_event.add_source(self.source.url, note="Senate jouenal listing")
+                yield vote_event
+
+
+class SenateJournalMonth(HtmlListPage):
+
+    def __init__(self, source, votes_list):
+        super().__init__(source=source)
+        self.votes_list = votes_list
+
+    def process_page(self):
+        journal_pdf_links = XPath("//tr/td/a/@href").match(self.root)
+        for journal_pdf_link in journal_pdf_links:
+            # if journal_pdf_link in (
+            #     "https://malegislature.gov/Journal/Senate/193/806/sj03302023_1100AM.pdf",
+            #     "https://malegislature.gov/Journal/Senate/193/768/sj03232023_0100PM.pdf",
+            # ):
+            yield SenateJournal(source=URL(journal_pdf_link, verify=False),
+                                votes_list=self.votes_list
+                                )
+
+
+class SenateJournal(PdfPage):
+    motion_and_vote_total = r"((?P<rawmotion>.{900}))\(yeas\s*(?P<firstyeatotal>\d+)\s*(?:-|to)\s*nays.(?P<firstnaytotal>\d+)\)"
+    vote_id = r"\[Yeas\.?.and.Nays\.?.No\.?.(?P<votenumber>\d+)\]"
+    vote_section = r"YEAS(?P<yealines>.*?)(?P<secondyeatotal>\d+)\.(?P<extrayealines>.*?)NAYS(?P<naylines>.*?)(?P<secondnaytotal>\d+)(?:(?P<extranaylines>.{0,30}?)ABSENT OR NOT VOTING(?P<nvlines>.*?)(?P<secondnvtotal>\d+))?"
+
+    motion_and_vote_total_re = re.compile(motion_and_vote_total, re.DOTALL)
+    vote_id_re = re.compile(vote_id, re.DOTALL)
+    vote_section_re = re.compile(vote_section, re.DOTALL)
+
+    total_vote_re = re.compile(f"{motion_and_vote_total}.*?{vote_id}.*?{vote_section}", re.DOTALL)
+
+    not_name_re = re.compile(r"^(\d|UNCORRECTED|Joint Rules|\.|Real ID,-- homeless"
+                             r"|Pharmacists,-- PrEP\."
+                             r"|Brewster,-- land\."
+                             r"|Provincetown,-wastewater"
+                             r")"
+                             )
+    text = None
+
+    precise_motion = r"question on\s+(.+)\s+was determined"
+    precise_motion_re = re.compile(precise_motion, re.DOTALL)
+
+    bill_id = r"(House|Senate),\s+No\.\s+(\d+)"
+    bill_id_re = re.compile(bill_id, re.DOTALL)
+
+    motion_classification = {
+        r"passing.+engross": "engrossment",
+        r"adoption.+amendment": "amendment-passage",
+        r"acceptance.+report": "report-acceptance",
+        r"passing.+enacted": "passage",
+        r"approving.+plan": "passage",
+    }
+
+    date_time_re = re.compile(r"sj(\d{8})_(?:\d{3,4}[AM|PM])\.pdf")
+
+    journal_date = None
+
+    def __init__(self, source, votes_list):
+        super().__init__(source=source)
+        self.votes_list = votes_list
+
+    def process_date(self):
+        # Find the match
+        datetime_match = self.date_time_re.search(self.source.url)
+
+        if datetime_match:
+            date_str = datetime_match.group(1)
+            vote_date = dt.datetime.strptime(date_str, "%Y%m%d")
+            formatted_date = vote_date.strftime("%Y-%m-%d")
+
+            print("Formatted date:", formatted_date)
+            return formatted_date
+
+        else:
+            raise Exception(f"Datetime with known format not in pdf url: "
+                            f"{self.source.url}")
+
+    def process_page(self):
+        vote_date = self.process_date()
+
+        # Remove special characters that look like the - character
+        self.text = self.text.replace("–", "-").replace("−", "-").replace("—", "-")
+
+        # Search for each of the three components of the larger regex separately.
+        votes_mt = self.motion_and_vote_total_re.findall(self.text)
+        votes_s = self.vote_section_re.findall(self.text)
+        votes_id = self.vote_id_re.findall(self.text)
+        # Check to make sure they all found the same number of matches
+        # If they disagree on number of matches, the scraper will not get
+        # the data correctly so emit a warning and skip this pdf.
+        if not (len(votes_mt) == len(votes_s) == len(votes_id)):
+            self.logger.warn(f"\nCould not accurately parse votes for "
+                             f"{self.source.url}\n"
+                             f"len(votes_mt):{len(votes_mt)}\n"
+                             f"len(votes_s):{len(votes_s)}\n"
+                             f"len(votes_id):{len(votes_id)}\n"
+                             f"{self.text}")
+        else:
+            # Run full regex search.
+            votes = self.total_vote_re.finditer(self.text)
+            votes = [self.parse_match(v) for v in votes]
+            # yield from
+            print("\n\n".join([str(x) for x in votes]))
+            # print(self.source.url)
+
+    def parse_match(self, match):
+
+        # TODO: to get bill_id, it needs to treat each vote separately
+        #  and be able to search backwards in the text for most proximal
+        #  bill_id format match (regex or another approach may be needed)
+
+        # You can ignore the current code that attempts to get a bill_id match, as its
+        # about to be deprecated by the below described solution.
+
+        """
+        The reason why it can't just grab it in the current solution is you would have to go
+        far back enough into the prior paragraphs to be able to find the bill_id, but not
+        too far back because then it prevents matches on other votes when vote lines are
+        positioned too close together in the PDF. This is easier to do for the motion text
+        than the bill_id
+
+        That is why the motion_and_vote_total regex pattern defined at the top of this class
+        gets 900 characters preceding each instance of "(yeas ## - nays ##)". It needs to go
+        far back enough to get the motion text found in between "question on" and "was determined",
+        but I have had to find a sweet spot where it gets the motion text without preventing matches
+        on proximally preceding votes.
+
+        Here's the solution for getting the bill_ids which I'm currently working on:
+
+        I have a helper function that uses the raw_motion_text essentially as an indexing point to divide
+        via a split() of the `self.text` on that substring, and then use a regex that gets the last
+        instance of a bill_id format in the `self.text`, i.e. bill_id_match = re.findall(pattern, text)[-1]
+
+        This should get us the accurate data because the last bill_id occurrence in the text before
+        the vote lines is the bill that is being voted upon, in every case I have found.
+        """
+        raw_motion_text = match.group("rawmotion")
+        bill_id_match = self.bill_id_re.search(raw_motion_text)
+        if not bill_id_match:
+            # raise Exception(f"no bill id in {raw_motion_text} at {self.source.url}")
+            self.logger.warn(f"Could not find bill_id at {self.source.url}")
+        else:
+            print("\n\n\n" + bill_id_match.group(2) + "\n\n\n")
+
+        motion_text = self.precise_motion_re.search(raw_motion_text).group(1)
+        single_line_motion = motion_text.replace("\n", " ")
+
+        normalized_motion = single_line_motion.capitalize()
+
+        vote_classification = None
+        for pattern, classification in self.motion_classification.items():
+            if re.compile(pattern).search(single_line_motion):
+                vote_classification = classification
+                break
+
+        if not vote_classification:
+            raise Exception(
+                f"""
+                No vote_classification from {single_line_motion}" in journal at {self.source.url}
+                """
+            )
+
+        # Get the total counts
+        first_total_yea = int(match.group("firstyeatotal"))
+        first_total_nay = int(match.group("firstnaytotal"))
+        total_yea = int(match.group("secondyeatotal"))
+        total_nay = int(match.group("secondnaytotal"))
+
+        # # Get non-voting total count, but section may be missing
+        # possible_total_nv = match.group("secondnvtotal")
+        # if possible_total_nv is None:
+        #     possible_total_nv = "0"
+        # total_nv = int(possible_total_nv)
+
+        vote_number = match.group("votenumber")
+
+        # Get list of voter names for each section
+        yea_voters = self.find_names(match.group("yealines"))
+        nay_voters = self.find_names(match.group("naylines"))
+        yea_voters.extend(self.find_names(match.group("extrayealines")))
+
+        # extre nay lines section may be missing
+        possible_extra_nay_voters = match.group("extranaylines")
+        if possible_extra_nay_voters is None:
+            possible_extra_nay_voters = ""
+        nay_voters.extend(self.find_names(possible_extra_nay_voters))
+
+        # non-voting voter name section may be missing
+        possible_nv_voters = match.group("nvlines")
+        if possible_nv_voters is None:
+            possible_nv_voters = ""
+        nv_voters = self.find_names(possible_nv_voters)
+
+        data = dict(
+            total_yea=total_yea,
+            total_nay=total_nay,
+            yea_voters=yea_voters,
+            nay_voters=nay_voters,
+            nv_voters=nv_voters,
+            vote_number=vote_number,
+            bill_id_match=bill_id_match.group(2) if bill_id_match else None,
+            normalized_motion=normalized_motion,
+        )
+        self.votes_list.append(data)
+        print(self.votes_list)
+
+        # TODO: there are a few regex kinks leading to possible miscounts in the vote
+        #  tallies (not more than 3 votes off) which could be a problem when trying to
+        #  use these counts to determine whether a vote passed/failed.
+        #  But given the typical large vote margins between yeas and nays, even the current
+        #  occassional miscounts would rarely lead to a false determination of the vote result.
+        yea_mismatch = first_total_yea != total_yea
+        nay_mismatch = first_total_nay != total_nay
+        if yea_mismatch or nay_mismatch:
+            print(self.text)
+            print(self.source.url)
+            print(f"""
+            first_total_yea = {first_total_yea}
+            total_yea = {total_yea}
+            first_total_nay = {first_total_nay}
+            total_nay = {total_nay}
+            yea_mismatch = {yea_mismatch}
+            nay_mismatch = {nay_mismatch}
+            {data}
+            """)
+            raise Exception("ynmismatch")
+
+        # Check that total voters and total votes match up
+        yea_matches_miscount = len(yea_voters) - total_yea
+        nay_matches_miscount = len(nay_voters) - total_nay
+        for miscount in yea_matches_miscount, nay_matches_miscount:
+            # Allows for minor miscount in cases of PDF formatting issues
+            if abs(miscount) > 1:
+                print(self.text)
+                print(self.source.url)
+                print(f"""
+                            yea_voters = {len(yea_voters)}
+                            total_yea = {total_yea}
+                            nay_voters = {len(nay_voters)}
+                            total_nay = {total_nay}
+                            yea_matches_miscount = {yea_matches_miscount}
+                            nay_matches_miscount = {nay_matches_miscount}
+                            {data}
+                            """)
+                raise Exception("recorded vote totals differ from logs")
+
+        # TODO: comment back in when bill_id and result handling is complete
+        # vote = VoteEvent(
+        #     chamber="upper",
+        #     legislative_session="193",
+        #     # start_date=vote_date,
+        #     # motion_text=self.motion,
+        #     bill=bill_id,
+        #     result="pass" if vote_passed else "fail",
+        #     classification="passage",
+        # )
+
+        return data
+
+    # Finds names in text, ignoring some common phrases and empty lines
+    def find_names(self, text):
+        text = [x.strip() for x in text.split("\n")]
+        text = [x for x in text if x != ""]
+        names = [x for x in text if not self.not_name_re.match(x) and "," in x]
+
+        return names
+
+
+class HouseVoteRecordParser:
+    tz = pytz.timezone("US/Eastern")
+    total_yea_re = re.compile(r"(\d+) yeas", re.IGNORECASE)
+    total_nay_re = re.compile(r"(\d+) nays", re.IGNORECASE)
+    total_nv_re = re.compile(r"(\d+) n/v", re.IGNORECASE)
+    bill_re = re.compile(r"(h|s)\.? ?(\d+) ?(.*)", re.IGNORECASE)
+    number_re = re.compile(r"no\.? ?(\d+)", re.IGNORECASE)
+
+    def __init__(self, vote_text):
+        self.votes = []
+        self.names = []
+        self.time = None
+        self.total_yea = None
+        self.total_nay = None
+        self.total_nv = None
+        self.bill_id = None
+        self.vote_number = None
+        self.motion = None
+        self.motion_parts = []
+        lines = vote_text.split("\n")
+        self.raw = vote_text
+        for line in lines:
+            self.read_line(line)
+
+    def read_line(self, line):
+        line = line.strip()
+
+        # These lines contain no useful info and are skipped
+        blank = line in ["\x0c", ""]
+        contains_equal = "=" in line
+        yea_and_nay = line == "Yea and Nay"
+        if blank or contains_equal or yea_and_nay:
+            pass
+
+        # Check for vote number. When the vote number is found, we can be sure
+        # that all the motion text has been read.
+        elif (match := self.number_re.match(line)) is not None:
+            self.vote_number = int(match.group(1))
+            self.motion = " ".join(self.motion_parts)
+
+        # Check for time
+        elif ":" in line:
+            when = datetime.datetime.strptime(line, "%m/%d/%Y %I:%M %p")
+            when = self.tz.localize(when)
+            self.time = when
+
+        # Check for vote totals
+        elif (match := self.total_yea_re.match(line)) is not None:
+            self.total_yea = int(match.group(1))
+
+        elif (match := self.total_nay_re.match(line)) is not None:
+            self.total_nay = int(match.group(1))
+
+        elif (match := self.total_nv_re.match(line)) is not None:
+            self.total_nv = int(match.group(1))
+
+        # line is vote type
+        # Y is sometimes read as P by the pdf reader.
+        elif line in ["Y", "N", "X", "P"]:
+            self.votes.append(line)
+
+        # Read the line as motion, motion text may come through as multiple
+        # lines so append the line to an array.
+        elif self.vote_number is None:
+            self.motion_parts.append(line)
+
+        # At this point, the line is assumed to contain a name.
+
+        # Special case where pdf reader mistakenly joins two names together into
+        # a single line. This can happen if the first name starts with a double
+        # '--'. This can cause the next name in the list to be joined with
+        # this line. e.g. "--Jones-Smith" instead of "--Jones--" and "Smith" on
+        # separate lines.
+        elif line.startswith("--"):
+            all_names = [x for x in line[2:].split("-") if x]
+            self.names.extend(all_names)
+
+        # The line is a single name, but may be surrounded by '--'
+        else:
+            self.names.append(line.replace("--", ""))
+
+    # Raises an error or writes warning to logger. Returns true if data is valid
+    def error_if_invalid(self):
+        votes_match_names = len(self.names) == len(self.votes)
+        if not votes_match_names:
+            raise VoteTotalMismatch()
+
+    def get_warning(self):
+        # Some votes may not have any motion listed
+        if not self.motion:
+            return (
+                f"Found vote with no motion listed, skipping vote #{self.vote_number}"
+            )
+
+    def createVoteEvent(self):
+        vote_passed = self.total_yea > self.total_nay
+
+        # Check for bill id in motion text
+        bill_id = None
+        if (match := self.bill_re.match(self.motion)) is not None:
+            bill_id = f"{match.group(1)}{match.group(2)}"
+
+        vote = VoteEvent(
+            chamber="lower",
+            legislative_session="193",
+            start_date=self.time,
+            motion_text=self.motion,
+            bill=bill_id,
+            result="pass" if vote_passed else "fail",
+            classification="passage",
+        )
+
+        vote.set_count("yes", self.total_yea)
+        vote.set_count("no", self.total_nay)
+
+        vote_dictionary = {
+            "Y": "yes",
+            "P": "yes",  # Y's can be misread as P's
+            "N": "no",
+            "X": "not voting",
+        }
+
+        # Add all individual votes
+        for name, vote_val in zip(self.names, self.votes):
+            vote.vote(vote_dictionary[vote_val], name)
+        return vote
+
+
+class HouseRollCall(PdfPage):
+    def process_page(self):
+        # Each bill vote starts with the same text, so use it as a separator.
+        separator = "MASSACHUSETTS HOUSE OF REPRESENTATIVES"
+
+        # Ignore first element after split because it's going to be blank
+        vote_text = self.text.split(separator)[1:]
+
+        for vote in vote_text:
+            vote_parser = HouseVoteRecordParser(vote)
+            if (warning := vote_parser.get_warning()) is not None:
+                self.logger.warn(warning)
+            else:
+                vote_parser.error_if_invalid()
+                vote_event = vote_parser.createVoteEvent()
+                vote_event.add_source(self.source.url, note="Vote record pdf")
+                yield vote_event
+
+
+"""
+
+"""

From 0495f25a37c2fc1ba1fb4b6d1149abc0bdc55dd4 Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Fri, 26 Jul 2024 09:59:56 -0700
Subject: [PATCH 2/9] MA: fix AM/PM alternation in journal date regex

---
 scrapers/ma/votes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
index 7b5b82dade..3815b22115 100644
--- a/scrapers/ma/votes.py
+++ b/scrapers/ma/votes.py
@@ -106,7 +106,7 @@ class SenateJournal(PdfPage):
         r"approving.+plan": "passage",
     }
 
-    date_time_re = re.compile(r"sj(\d{8})_(?:\d{3,4}[AM|PM])\.pdf")
+    date_time_re = re.compile(r"sj(\d{8})_(?:\d{3,4}(?:AM|PM))\.pdf")
 
     journal_date = None
 

From 049fe18f3c530fefb70347541944c4a986d90b74 Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Fri, 26 Jul 2024 10:13:38 -0700
Subject: [PATCH 3/9] MA: fix journal date regex and date format

---
 scrapers/ma/votes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
index 3815b22115..73c013d396 100644
--- a/scrapers/ma/votes.py
+++ b/scrapers/ma/votes.py
@@ -106,7 +106,7 @@ class SenateJournal(PdfPage):
         r"approving.+plan": "passage",
     }
 
-    date_time_re = re.compile(r"sj(\d{8})_(?:\d{3,4}(?:AM|PM))\.pdf")
+    date_time_re = re.compile(r"sj(\d{8})_")
 
     journal_date = None
 
@@ -120,7 +120,7 @@ def process_date(self):
 
         if datetime_match:
             date_str = datetime_match.group(1)
-            vote_date = dt.datetime.strptime(date_str, "%Y%m%d")
+            vote_date = dt.datetime.strptime(date_str, "%m%d%Y")
             formatted_date = vote_date.strftime("%Y-%m-%d")
 
             print("Formatted date:", formatted_date)

From e6d588041b848438e68d50d2b23c7f587b61eb5f Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Fri, 26 Jul 2024 10:49:49 -0700
Subject: [PATCH 4/9] MA: datetime to dt

---
 scrapers/ma/votes.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
index 73c013d396..9016f7158c 100644
--- a/scrapers/ma/votes.py
+++ b/scrapers/ma/votes.py
@@ -90,7 +90,6 @@ class SenateJournal(PdfPage):
                              r"|Provincetown,-wastewater"
                              r")"
                              )
-    text = None
 
     precise_motion = r"question on\s+(.+)\s+was determined"
     precise_motion_re = re.compile(precise_motion, re.DOTALL)
@@ -108,7 +107,9 @@ class SenateJournal(PdfPage):
 
     date_time_re = re.compile(r"sj(\d{8})_")
 
+    text = None
     journal_date = None
+    bill_id = None
 
     def __init__(self, source, votes_list):
         super().__init__(source=source)
@@ -320,6 +321,10 @@ def find_names(self, text):
 
         return names
 
+    def get_bill_id(self, motion_text):
+
+
+
 
 class HouseVoteRecordParser:
     tz = pytz.timezone("US/Eastern")
@@ -363,7 +368,7 @@ def read_line(self, line):
 
         # Check for time
         elif ":" in line:
-            when = datetime.datetime.strptime(line, "%m/%d/%Y %I:%M %p")
+            when = dt.datetime.strptime(line, "%m/%d/%Y %I:%M %p")
             when = self.tz.localize(when)
             self.time = when
 

From a3715c038abb1a0ae5a6e4a813a94c04f9fba0fe Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Fri, 26 Jul 2024 10:55:22 -0700
Subject: [PATCH 5/9] MA: remove empty get_bill_id stub

---
 scrapers/ma/votes.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
index 9016f7158c..6d499e092b 100644
--- a/scrapers/ma/votes.py
+++ b/scrapers/ma/votes.py
@@ -321,10 +321,6 @@ def find_names(self, text):
 
         return names
 
-    def get_bill_id(self, motion_text):
-
-
-
 
 class HouseVoteRecordParser:
     tz = pytz.timezone("US/Eastern")

From 9db7e17f9b2b33dccd78a6ce97da6caadfa83ea6 Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Fri, 26 Jul 2024 13:19:34 -0700
Subject: [PATCH 6/9] MA: add vote_event instantiation

---
 scrapers/ma/votes.py | 174 ++++++++++++++++++++++++-------------------
 1 file changed, 99 insertions(+), 75 deletions(-)

diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
index 6d499e092b..aa1327f674 100644
--- a/scrapers/ma/votes.py
+++ b/scrapers/ma/votes.py
@@ -99,10 +99,12 @@ class SenateJournal(PdfPage):
 
     motion_classification = {
         r"passing.+engross": "engrossment",
-        r"adoption.+amendment": "amendment-passage",
+        r"adoption.+amendment": "amendment-adoption",
         r"acceptance.+report": "report-acceptance",
         r"passing.+enacted": "passage",
         r"approving.+plan": "passage",
+        r"suspension.+Rule": "rule-suspension",
+        r"adoption.+motion": "motion-adoption",
     }
 
     date_time_re = re.compile(r"sj(\d{8})_")
@@ -110,6 +112,7 @@ class SenateJournal(PdfPage):
     text = None
     journal_date = None
     bill_id = None
+    vote_date = None
 
     def __init__(self, source, votes_list):
         super().__init__(source=source)
@@ -124,7 +127,6 @@ def process_date(self):
             vote_date = dt.datetime.strptime(date_str, "%m%d%Y")
             formatted_date = vote_date.strftime("%Y-%m-%d")
 
-            print("Formatted date:", formatted_date)
             return formatted_date
 
         else:
@@ -132,7 +134,7 @@ def process_date(self):
                             f"{self.source.url}")
 
     def process_page(self):
-        vote_date = self.process_date()
+        self.vote_date = self.process_date()
 
         # Remove special characters that look like the - character
         self.text = self.text.replace("–", "-").replace("−", "-").replace("—", "-")
@@ -149,17 +151,26 @@ def process_page(self):
                              f"{self.source.url}\n"
                              f"len(votes_mt):{len(votes_mt)}\n"
                              f"len(votes_s):{len(votes_s)}\n"
-                             f"len(votes_id):{len(votes_id)}\n"
-                             f"{self.text}")
+                             f"len(votes_id):{len(votes_id)}\n")
         else:
             # Run full regex search.
-            votes = self.total_vote_re.finditer(self.text)
-            votes = [self.parse_match(v) for v in votes]
-            # yield from
-            print("\n\n".join([str(x) for x in votes]))
-            # print(self.source.url)
+            vote_matches = self.total_vote_re.finditer(self.text)
+            votes_data_list = []
+
+            i = 0
+            for v_match in vote_matches:
+                vote = self.parse_match(v_match, i)
+                votes_data_list.append(vote)
+                i += 1
+                yield vote
+
+            print("\n\n".join([str(x) for x in votes_data_list]))
 
-    def parse_match(self, match):
+    def parse_match(self, match, index):
+        bill_id = self.get_bill_id(index)
+        if not bill_id:
+            self.logger.warn(f"No valid bill id found preceding vote lines in {self.source.url}")
+            return {}
 
         # TODO: to get bill_id, it needs to treat each vote separately
         #  and be able to search backwards in the text for most proximal
@@ -191,17 +202,12 @@ def parse_match(self, match):
         the vote lines is the bill that is being voted upon, in every case I have found.
         """
         raw_motion_text = match.group("rawmotion")
-        bill_id_match = self.bill_id_re.search(raw_motion_text)
-        if not bill_id_match:
-            # raise Exception(f"no bill id in {raw_motion_text} at {self.source.url}")
-            self.logger.warn(f"Could not find bill_id at {self.source.url}")
-        else:
-            print("\n\n\n" + bill_id_match.group(2) + "\n\n\n")
 
         motion_text = self.precise_motion_re.search(raw_motion_text).group(1)
         single_line_motion = motion_text.replace("\n", " ")
 
         normalized_motion = single_line_motion.capitalize()
+        print(normalized_motion)
 
         vote_classification = None
         for pattern, classification in self.motion_classification.items():
@@ -247,71 +253,76 @@ def parse_match(self, match):
             possible_nv_voters = ""
         nv_voters = self.find_names(possible_nv_voters)
 
-        data = dict(
-            total_yea=total_yea,
-            total_nay=total_nay,
-            yea_voters=yea_voters,
-            nay_voters=nay_voters,
-            nv_voters=nv_voters,
-            vote_number=vote_number,
-            bill_id_match=bill_id_match.group(2) if bill_id_match else None,
-            normalized_motion=normalized_motion,
-        )
-        self.votes_list.append(data)
-        print(self.votes_list)
-
-        # TODO: there are a few regex kinks leading to possible miscounts in the vote
-        #  tallies (not more than 3 votes off) which could be a problem when trying to
-        #  use these counts to determine whether a vote passed/failed.
-        #  But given the typical large vote margins between yeas and nays, even the current
-        #  occassional miscounts would rarely lead to a false determination of the vote result.
+        # # To help flag certain high priority console logging during debugging
+        # red_color = '\033[91m'
+        # reset_color = '\033[0m'
+
+        first_margin = first_total_yea - first_total_nay
+        final_margin = total_yea - total_nay
+        abs_first, abs_final = abs(first_margin), abs(final_margin)
+        if abs_first < abs_final:
+            determinative_margin = abs_first
+            vote_passed = True if first_margin > 0 else False
+        else:
+            determinative_margin = abs_final
+            vote_passed = True if final_margin > 0 else False
+
         yea_mismatch = first_total_yea != total_yea
         nay_mismatch = first_total_nay != total_nay
         if yea_mismatch or nay_mismatch:
-            print(self.text)
-            print(self.source.url)
-            print(f"""
-            first_total_yea = {first_total_yea}
-            total_yea = {total_yea}
-            first_total_nay = {first_total_nay}
-            total_nay = {total_nay}
-            yea_mismatch = {yea_mismatch}
-            nay_mismatch = {nay_mismatch}
-            {data}
-            """)
-            raise Exception("ynmismatch")
-
-        # Check that total voters and total votes match up
+            self.logger.warn(f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}")
+            return {}
+            # print(self.source.url)
+            # print(f"""
+            # first_total_yea = {first_total_yea}
+            # total_yea = {total_yea}
+            # first_total_nay = {first_total_nay}
+            # total_nay = {total_nay}
+            # yea_mismatch = {yea_mismatch}
+            # nay_mismatch = {nay_mismatch}
+            # {data}
+            # """)
+            #
+            # if yea_mismatch:
+            #     if abs(first_total_yea - total_yea) > determinative_margin:
+            #         print(f"{red_color}YEA MISMATCH GREATER THAN DETERMINATIVE MARGIN{reset_color}")
+            # if nay_mismatch:
+            #     if abs(first_total_nay - total_nay) > determinative_margin:
+            #         print(f"{red_color}NAY MISMATCH GREATER THAN DETERMINATIVE MARGIN{reset_color}")
+        #
+        # # Check that total voters and total votes match up
         yea_matches_miscount = len(yea_voters) - total_yea
         nay_matches_miscount = len(nay_voters) - total_nay
         for miscount in yea_matches_miscount, nay_matches_miscount:
             # Allows for minor miscount in cases of PDF formatting issues
-            if abs(miscount) > 1:
-                print(self.text)
-                print(self.source.url)
-                print(f"""
-                            yea_voters = {len(yea_voters)}
-                            total_yea = {total_yea}
-                            nay_voters = {len(nay_voters)}
-                            total_nay = {total_nay}
-                            yea_matches_miscount = {yea_matches_miscount}
-                            nay_matches_miscount = {nay_matches_miscount}
-                            {data}
-                            """)
-                raise Exception("recorded vote totals differ from logs")
-
-        # TODO: comment back in when bill_id and result handling is complete
-        # vote = VoteEvent(
-        #     chamber="upper",
-        #     legislative_session="193",
-        #     # start_date=vote_date,
-        #     # motion_text=self.motion,
-        #     bill=bill_id,
-        #     result="pass" if vote_passed else "fail",
-        #     classification="passage",
-        # )
-
-        return data
+            if abs(miscount) > determinative_margin:
+                self.logger.warn(
+                    f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}")
+                return {}
+                # print(f"""{red_color}
+                #             MISCOUNT (i.e. total voters and total votes don't match in big way!)
+                #             yea_voters = {len(yea_voters)}
+                #             total_yea = {total_yea}
+                #             nay_voters = {len(nay_voters)}
+                #             total_nay = {total_nay}
+                #             yea_matches_miscount = {yea_matches_miscount}
+                #             nay_matches_miscount = {nay_matches_miscount}
+                #             {data}
+                #             {reset_color}""")
+
+        vote_event = VoteEvent(
+            chamber="upper",
+            legislative_session="193",
+            start_date=self.vote_date,
+            motion_text=normalized_motion,
+            bill=bill_id,
+            result="pass" if vote_passed else "fail",
+            classification=vote_classification,
+        )
+
+        vote_event.add_source(self.source.url)
+
+        return vote_event
 
     # Finds names in text, ignoring some common phrases and empty lines
     def find_names(self, text):
@@ -321,6 +332,19 @@ def find_names(self, text):
 
         return names
 
+    def get_bill_id(self, index):
+        pre_vote_sections = self.text.split("(yeas")[:-1]
+        relevant_section = pre_vote_sections[index]
+        bill_id_match = re.findall(self.bill_id_re, relevant_section)
+        if bill_id_match:
+            chamber, number = bill_id_match[-1]
+            self.bill_id = f"{chamber[0]} {number}"
+        if self.bill_id:
+            print(f"BILL ID MATCH: {self.bill_id}")
+        else:
+            self.logger.warn(f"No preceding bill id for vote {index + 1} in {self.source.url}")
+        return self.bill_id
+
 
 class HouseVoteRecordParser:
     tz = pytz.timezone("US/Eastern")

From 036d7e0e6b8484ce2d1e67e611eb8f326605b462 Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Fri, 26 Jul 2024 16:18:44 -0700
Subject: [PATCH 7/9] MA: simplified vote classifications

---
 scrapers/ma/votes.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
index aa1327f674..ba59c7abd8 100644
--- a/scrapers/ma/votes.py
+++ b/scrapers/ma/votes.py
@@ -98,13 +98,10 @@ class SenateJournal(PdfPage):
     bill_id_re = re.compile(bill_id, re.DOTALL)
 
     motion_classification = {
-        r"passing.+engross": "engrossment",
-        r"adoption.+amendment": "amendment-adoption",
-        r"acceptance.+report": "report-acceptance",
+        r"passing.+engross": "passage",
+        r"adoption.+amendment": "amendment",
         r"passing.+enacted": "passage",
         r"approving.+plan": "passage",
-        r"suspension.+Rule": "rule-suspension",
-        r"adoption.+motion": "motion-adoption",
     }
 
     date_time_re = re.compile(r"sj(\d{8})_")
@@ -216,7 +213,7 @@ def parse_match(self, match, index):
                 break
 
         if not vote_classification:
-            raise Exception(
+            self.logger.warn(
                 f"""
                 No vote_classification from {single_line_motion}" in journal at {self.source.url}
                 """

From cadff7a79dc7db2172da7ee89f30903aebbf649a Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Mon, 29 Jul 2024 08:59:23 -0700
Subject: [PATCH 8/9] MA: removed unnecessary prints, comments, unused vars

---
 scrapers/ma/votes.py | 48 ++++++++------------------------------------
 1 file changed, 8 insertions(+), 40 deletions(-)

diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
index ba59c7abd8..f023a50edb 100644
--- a/scrapers/ma/votes.py
+++ b/scrapers/ma/votes.py
@@ -161,8 +161,6 @@ def process_page(self):
                 i += 1
                 yield vote
 
-            print("\n\n".join([str(x) for x in votes_data_list]))
-
     def parse_match(self, match, index):
         bill_id = self.get_bill_id(index)
         if not bill_id:
@@ -204,7 +202,6 @@ def parse_match(self, match, index):
         single_line_motion = motion_text.replace("\n", " ")
 
         normalized_motion = single_line_motion.capitalize()
-        print(normalized_motion)
 
         vote_classification = None
         for pattern, classification in self.motion_classification.items():
@@ -231,7 +228,7 @@ def parse_match(self, match, index):
         #     possible_total_nv = "0"
         # total_nv = int(possible_total_nv)
 
-        vote_number = match.group("votenumber")
+        # vote_number = match.group("votenumber")
 
         # Get list of voter names for each section
         yea_voters = self.find_names(match.group("yealines"))
@@ -244,11 +241,11 @@ def parse_match(self, match, index):
             possible_extra_nay_voters = ""
         nay_voters.extend(self.find_names(possible_extra_nay_voters))
 
-        # non-voting voter name section may be missing
-        possible_nv_voters = match.group("nvlines")
-        if possible_nv_voters is None:
-            possible_nv_voters = ""
-        nv_voters = self.find_names(possible_nv_voters)
+        # # non-voting voter name section may be missing
+        # possible_nv_voters = match.group("nvlines")
+        # if possible_nv_voters is None:
+        #     possible_nv_voters = ""
+        # nv_voters = self.find_names(possible_nv_voters)
 
         # # To help flag certain high priority console logging during debugging
         # red_color = '\033[91m'
@@ -269,24 +266,7 @@ def parse_match(self, match, index):
         if yea_mismatch or nay_mismatch:
             self.logger.warn(f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}")
             return {}
-            # print(self.source.url)
-            # print(f"""
-            # first_total_yea = {first_total_yea}
-            # total_yea = {total_yea}
-            # first_total_nay = {first_total_nay}
-            # total_nay = {total_nay}
-            # yea_mismatch = {yea_mismatch}
-            # nay_mismatch = {nay_mismatch}
-            # {data}
-            # """)
-            #
-            # if yea_mismatch:
-            #     if abs(first_total_yea - total_yea) > determinative_margin:
-            #         print(f"{red_color}YEA MISMATCH GREATER THAN DETERMINATIVE MARGIN{reset_color}")
-            # if nay_mismatch:
-            #     if abs(first_total_nay - total_nay) > determinative_margin:
-            #         print(f"{red_color}NAY MISMATCH GREATER THAN DETERMINATIVE MARGIN{reset_color}")
-        #
+
         # # Check that total voters and total votes match up
         yea_matches_miscount = len(yea_voters) - total_yea
         nay_matches_miscount = len(nay_voters) - total_nay
@@ -296,16 +276,6 @@ def parse_match(self, match, index):
                 self.logger.warn(
                     f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}")
                 return {}
-                # print(f"""{red_color}
-                #             MISCOUNT (i.e. total voters and total votes don't match in big way!)
-                #             yea_voters = {len(yea_voters)}
-                #             total_yea = {total_yea}
-                #             nay_voters = {len(nay_voters)}
-                #             total_nay = {total_nay}
-                #             yea_matches_miscount = {yea_matches_miscount}
-                #             nay_matches_miscount = {nay_matches_miscount}
-                #             {data}
-                #             {reset_color}""")
 
         vote_event = VoteEvent(
             chamber="upper",
@@ -336,9 +306,7 @@ def get_bill_id(self, index):
         if bill_id_match:
             chamber, number = bill_id_match[-1]
             self.bill_id = f"{chamber[0]} {number}"
-        if self.bill_id:
-            print(f"BILL ID MATCH: {self.bill_id}")
-        else:
+        if not self.bill_id:
             self.logger.warn(f"No preceding bill id for vote {index + 1} in {self.source.url}")
         return self.bill_id
 

From b274fcb226c1862037012ccd3a271aede915bb39 Mon Sep 17 00:00:00 2001
From: Christopher Yamas <yamas.chris@gmail.com>
Date: Mon, 29 Jul 2024 10:28:48 -0700
Subject: [PATCH 9/9] MA: votes file lint fixes

---
 scrapers/ma/__init__.py |  2 +-
 scrapers/ma/votes.py    | 64 ++++++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/scrapers/ma/__init__.py b/scrapers/ma/__init__.py
index d550a21830..15dd76c398 100644
--- a/scrapers/ma/__init__.py
+++ b/scrapers/ma/__init__.py
@@ -11,7 +11,7 @@ class Massachusetts(State):
     scrapers = {
         "bills": MABillScraper,
         "events": MAEventScraper,
-        "votes": MAVoteScraper
+        "votes": MAVoteScraper,
     }
     legislative_sessions = [
         {
diff --git a/scrapers/ma/votes.py b/scrapers/ma/votes.py
index f023a50edb..05dec796c5 100644
--- a/scrapers/ma/votes.py
+++ b/scrapers/ma/votes.py
@@ -47,8 +47,7 @@ def process_page(self):
         month_links = XPath("//a[@aria-controls='journalList']/@href").match(self.root)
         for month_link in month_links:
             vote_events = SenateJournalMonth(
-                source=URL(month_link, verify=False),
-                votes_list=self.votes_list
+                source=URL(month_link, verify=False), votes_list=self.votes_list
             ).do_scrape()
             for vote_event in vote_events:
                 # vote_event.add_source(self.source.url, note="Senate jouenal listing")
@@ -56,7 +55,6 @@ def process_page(self):
 
 
 class SenateJournalMonth(HtmlListPage):
-
     def __init__(self, source, votes_list):
         super().__init__(source=source)
         self.votes_list = votes_list
@@ -68,9 +66,9 @@ def process_page(self):
             #     "https://malegislature.gov/Journal/Senate/193/806/sj03302023_1100AM.pdf",
             #     "https://malegislature.gov/Journal/Senate/193/768/sj03232023_0100PM.pdf",
             # ):
-            yield SenateJournal(source=URL(journal_pdf_link, verify=False),
-                                votes_list=self.votes_list
-                                )
+            yield SenateJournal(
+                source=URL(journal_pdf_link, verify=False), votes_list=self.votes_list
+            )
 
 
 class SenateJournal(PdfPage):
@@ -82,14 +80,17 @@ class SenateJournal(PdfPage):
     vote_id_re = re.compile(vote_id, re.DOTALL)
     vote_section_re = re.compile(vote_section, re.DOTALL)
 
-    total_vote_re = re.compile(f"{motion_and_vote_total}.*?{vote_id}.*?{vote_section}", re.DOTALL)
+    total_vote_re = re.compile(
+        f"{motion_and_vote_total}.*?{vote_id}.*?{vote_section}", re.DOTALL
+    )
 
-    not_name_re = re.compile(r"^(\d|UNCORRECTED|Joint Rules|\.|Real ID,-- homeless"
-                             r"|Pharmacists,-- PrEP\."
-                             r"|Brewster,-- land\."
-                             r"|Provincetown,-wastewater"
-                             r")"
-                             )
+    not_name_re = re.compile(
+        r"^(\d|UNCORRECTED|Joint Rules|\.|Real ID,-- homeless"
+        r"|Pharmacists,-- PrEP\."
+        r"|Brewster,-- land\."
+        r"|Provincetown,-wastewater"
+        r")"
+    )
 
     precise_motion = r"question on\s+(.+)\s+was determined"
     precise_motion_re = re.compile(precise_motion, re.DOTALL)
@@ -127,8 +128,9 @@ def process_date(self):
             return formatted_date
 
         else:
-            raise Exception(f"Datetime with known format not in pdf url: "
-                            f"{self.source.url}")
+            raise Exception(
+                f"Datetime with known format not in pdf url: {self.source.url}"
+            )
 
     def process_page(self):
         self.vote_date = self.process_date()
@@ -144,11 +146,13 @@ def process_page(self):
         # If they disagree on number of matches, the scraper will not get
         # the data correctly so emit a warning and skip this pdf.
         if not (len(votes_mt) == len(votes_s) == len(votes_id)):
-            self.logger.warn(f"\nCould not accurately parse votes for "
-                             f"{self.source.url}\n"
-                             f"len(votes_mt):{len(votes_mt)}\n"
-                             f"len(votes_s):{len(votes_s)}\n"
-                             f"len(votes_id):{len(votes_id)}\n")
+            self.logger.warn(
+                f"\nCould not accurately parse votes for "
+                f"{self.source.url}\n"
+                f"len(votes_mt):{len(votes_mt)}\n"
+                f"len(votes_s):{len(votes_s)}\n"
+                f"len(votes_id):{len(votes_id)}\n"
+            )
         else:
             # Run full regex search.
             vote_matches = self.total_vote_re.finditer(self.text)
@@ -164,7 +168,9 @@ def process_page(self):
     def parse_match(self, match, index):
         bill_id = self.get_bill_id(index)
         if not bill_id:
-            self.logger.warn(f"No valid bill id found preceding vote lines in {self.source.url}")
+            self.logger.warn(
+                f"No valid bill id found preceding vote lines in {self.source.url}"
+            )
             return {}
 
         # TODO: to get bill_id, it needs to treat each vote separately
@@ -264,7 +270,9 @@ def parse_match(self, match, index):
         yea_mismatch = first_total_yea != total_yea
         nay_mismatch = first_total_nay != total_nay
         if yea_mismatch or nay_mismatch:
-            self.logger.warn(f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}")
+            self.logger.warn(
+                f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}"
+            )
             return {}
 
         # # Check that total voters and total votes match up
@@ -274,7 +282,8 @@ def parse_match(self, match, index):
             # Allows for minor miscount in cases of PDF formatting issues
             if abs(miscount) > determinative_margin:
                 self.logger.warn(
-                    f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}")
+                    f"Cannot accurately parse to determine margins for vote {index + 1} in {self.source.url}"
+                )
                 return {}
 
         vote_event = VoteEvent(
@@ -307,7 +316,9 @@ def get_bill_id(self, index):
             chamber, number = bill_id_match[-1]
             self.bill_id = f"{chamber[0]} {number}"
         if not self.bill_id:
-            self.logger.warn(f"No preceding bill id for vote {index + 1} in {self.source.url}")
+            self.logger.warn(
+                f"No preceding bill id for vote {index + 1} in {self.source.url}"
+            )
         return self.bill_id
 
 
@@ -456,8 +467,3 @@ def process_page(self):
                 vote_event = vote_parser.createVoteEvent()
                 vote_event.add_source(self.source.url, note="Vote record pdf")
                 yield vote_event
-
-
-"""
-
-"""