From da0ceee1d6222315949274af9db3782366203c2d Mon Sep 17 00:00:00 2001 From: braykuka Date: Mon, 21 Oct 2024 08:21:31 +0200 Subject: [PATCH 1/5] ND: add votes scraping to bills scraper --- scrapers/nd/bills.py | 100 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/scrapers/nd/bills.py b/scrapers/nd/bills.py index 0a9bb7958d..d46c39695e 100644 --- a/scrapers/nd/bills.py +++ b/scrapers/nd/bills.py @@ -1,9 +1,12 @@ import logging import re from dateutil import parser -from openstates.scrape import Scraper, Bill +from openstates.scrape import Scraper, Bill, VoteEvent +import pytz from spatula import JsonPage from .actions import NDCategorizer +import lxml.html +import requests class BillList(JsonPage): @@ -11,6 +14,7 @@ class BillList(JsonPage): member_name_re = re.compile(r"^(Sen\.|Rep\.)\s*(.+),\s(.+)") comm_name_re = re.compile(r"^(House|Senate)\s*(.+)") version_name_re = re.compile(r"introduced|engrossment|enrollment") + _tz = pytz.timezone("US/Central") def __init__(self, input_data): super().__init__() @@ -128,6 +132,100 @@ def process_page(self): yield bill + # Get bill-actions url from bill-overview url + action_url = ( + bill_data["url"] + .replace("/bo", "/ba") + .replace("bill-overview", "bill-actions") + ) + + html_content = requests.get(action_url).content + doc = lxml.html.fromstring(html_content) + doc.make_links_absolute(action_url) + votes_list = doc.xpath( + '//div[@aria-labelledby="vote-modal"]//div[@class="modal-content"]' + ) + for vote_modal in votes_list: + motion_text = ( + vote_modal.xpath('.//h5[@class="modal-title"]')[0] + .text_content() + .strip() + ) + date = parser.parse( + vote_modal.xpath( + './/div[@class="modal-body"]/span[@class="float-right"]' + )[0] + .text_content() + .strip() + ) + start_date = self._tz.localize(date) + status = ( + vote_modal.xpath('.//div[@class="modal-body"]/span[@class="bold"]')[ + 0 + ] + .text_content() + .strip() + ) + chamber = "lower" if "house" in status.lower() else "upper" + status = "pass" if "passed" in status.lower() else "fail" + vote = VoteEvent( + chamber=chamber, + start_date=start_date, + motion_text=f"Motion for {motion_text} on {bill_id}", + result=status, + legislative_session=self.input.get("assembly_id"), + # TODO: get all possible classification types, replace below + classification="passage", + bill=bill_id, + bill_chamber="lower" if bill_id[0] == "H" else "upper", + ) + vote.add_source(action_url) + yes_count = ( + vote_modal.xpath( + './/div[@class="modal-body"]/div[./h6[contains(., "Yea")]]/h6' + )[0] + .text_content() + .strip() + .split(" ")[0] + ) + no_count = ( + vote_modal.xpath( + './/div[@class="modal-body"]/div[./h6[contains(., "Nay")]]/h6' + )[0] + .text_content() + .strip() + .split(" ")[0] + ) + other_count = ( + vote_modal.xpath( + './/div[@class="modal-body"]/div[./h6[contains(., "Absent")]]/h6' + )[0] + .text_content() + .strip() + .split(" ")[0] + ) + + vote.set_count("yes", int(yes_count)) + vote.set_count("no", int(no_count)) + vote.set_count("other", int(other_count)) + for vote_div in vote_modal.xpath( + './/div[@class="modal-body"]/div[./h6[contains(., "Yea")]]//a' + ): + voter_name = vote_div.text_content().strip() + vote.yes(voter_name) + for vote_div in vote_modal.xpath( + './/div[@class="modal-body"]/div[./h6[contains(., "Nay")]]//a' + ): + voter_name = vote_div.text_content().strip() + vote.no(voter_name) + for vote_div in vote_modal.xpath( + './/div[@class="modal-body"]/div[./h6[contains(., "Absent")]]//a' + ): + voter_name = vote_div.text_content().strip() + vote.vote("other", voter_name) + + yield vote + class NDBillScraper(Scraper): def scrape(self, session=None): From f1841ae83c6baa2fc65694b9005663770a3ac3fe Mon Sep 17 00:00:00 2001 From: braykuka Date: Wed, 23 Oct 2024 23:20:30 +0200 Subject: [PATCH 2/5] Fix: updates the voter name to the full name --- scrapers/nd/bills.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/scrapers/nd/bills.py b/scrapers/nd/bills.py index d46c39695e..13bed5e8d4 100644 --- a/scrapers/nd/bills.py +++ b/scrapers/nd/bills.py @@ -34,6 +34,22 @@ def create_source_url(self): f"{assembly_session_id}-{year}/data/bills.json" ) + def get_voter_name_from_url(self, url): + name_uri = ( + url.replace("https://ndlegis.gov/biography/", "") + .split("?")[0] + .split("/")[0] + .strip() + ) + + name_words = [] + for w in name_uri.split("-"): + if len(w) == 1: + name_words.append(f"{w}.".title()) + else: + name_words.append(w.title()) + return " ".join(name_words) + def process_page(self): json_response = self.response.json() bills = json_response.get("bills") @@ -208,20 +224,23 @@ def process_page(self): vote.set_count("yes", int(yes_count)) vote.set_count("no", int(no_count)) vote.set_count("other", int(other_count)) - for vote_div in vote_modal.xpath( + for vote_link in vote_modal.xpath( './/div[@class="modal-body"]/div[./h6[contains(., "Yea")]]//a' ): - voter_name = vote_div.text_content().strip() + voter_url = vote_link.attrib["href"] + voter_name = self.get_voter_name_from_url(voter_url) vote.yes(voter_name) - for vote_div in vote_modal.xpath( + for vote_link in vote_modal.xpath( './/div[@class="modal-body"]/div[./h6[contains(., "Nay")]]//a' ): - voter_name = vote_div.text_content().strip() + voter_url = vote_link.attrib["href"] + voter_name = self.get_voter_name_from_url(voter_url) vote.no(voter_name) - for vote_div in vote_modal.xpath( + for vote_link in vote_modal.xpath( './/div[@class="modal-body"]/div[./h6[contains(., "Absent")]]//a' ): - voter_name = vote_div.text_content().strip() + voter_url = vote_link.attrib["href"] + voter_name = self.get_voter_name_from_url(voter_url) vote.vote("other", voter_name) yield vote From 61c10a13ce406d35587f15de47b238b4efe004c8 Mon Sep 17 00:00:00 2001 From: braykuka Date: Wed, 23 Oct 2024 23:30:04 +0200 Subject: [PATCH 3/5] add docs --- scrapers/nd/bills.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scrapers/nd/bills.py b/scrapers/nd/bills.py index 13bed5e8d4..89ed15ffad 100644 --- a/scrapers/nd/bills.py +++ b/scrapers/nd/bills.py @@ -34,7 +34,16 @@ def create_source_url(self): f"{assembly_session_id}-{year}/data/bills.json" ) - def get_voter_name_from_url(self, url): + def get_voter_name_from_url(self, url: str) -> str: + """ + Description: + Get the full name from URL + + Example: + - https://ndlegis.gov/biography/liz-conmy -> Liz Conmy + - https://ndlegis.gov/biography/randy-a-schobinger -> Randy A. Schobinger + + """ name_uri = ( url.replace("https://ndlegis.gov/biography/", "") .split("?")[0] @@ -48,6 +57,7 @@ def get_voter_name_from_url(self, url): name_words.append(f"{w}.".title()) else: name_words.append(w.title()) + return " ".join(name_words) def process_page(self): From 7ed34ecd8a092a7284b5092a8c75738eeb046bc2 Mon Sep 17 00:00:00 2001 From: braykuka Date: Thu, 24 Oct 2024 14:17:27 +0200 Subject: [PATCH 4/5] update a function to get the voter name --- scrapers/nd/bills.py | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/scrapers/nd/bills.py b/scrapers/nd/bills.py index 89ed15ffad..e1737fda74 100644 --- a/scrapers/nd/bills.py +++ b/scrapers/nd/bills.py @@ -14,6 +14,8 @@ class BillList(JsonPage): member_name_re = re.compile(r"^(Sen\.|Rep\.)\s*(.+),\s(.+)") comm_name_re = re.compile(r"^(House|Senate)\s*(.+)") version_name_re = re.compile(r"introduced|engrossment|enrollment") + members_cache = {} + _tz = pytz.timezone("US/Central") def __init__(self, input_data): @@ -50,15 +52,37 @@ def get_voter_name_from_url(self, url: str) -> str: .split("/")[0] .strip() ) + name_words = [w.title() for w in name_uri.split("-")] + if len(name_words) == 3 and len(name_words[1]) == 1: + return "{0} {1}. {2}".format(*name_words) + elif len(name_words) == 3 and len(name_words[1]) > 1: + return "{0} {1}-{2}".format(*name_words) + else: + return " ".join(name_words) + + def get_voter_name_from_url_request(self, url: str) -> str: + """ + Description: + Get the full name from URL Request + + Example: + - https://ndlegis.gov/biography/liz-conmy -> Liz Conmy + - https://ndlegis.gov/biography/randy-a-schobinger -> Randy A. Schobinger + + """ + if url in self.members_cache: + return self.members_cache[url] + + html_content = requests.get(url).content + doc = lxml.html.fromstring(html_content) + doc.make_links_absolute(url) - name_words = [] - for w in name_uri.split("-"): - if len(w) == 1: - name_words.append(f"{w}.".title()) - else: - name_words.append(w.title()) + fullname = doc.xpath("string(//h1)").strip() + self.members_cache[url] = ( + fullname.replace("Representative", "").replace("Senator", "").strip() + ) - return " ".join(name_words) + return fullname def process_page(self): json_response = self.response.json() @@ -238,19 +262,19 @@ def process_page(self): './/div[@class="modal-body"]/div[./h6[contains(., "Yea")]]//a' ): voter_url = vote_link.attrib["href"] - voter_name = self.get_voter_name_from_url(voter_url) + voter_name = self.get_voter_name_from_url_request(voter_url) vote.yes(voter_name) for vote_link in vote_modal.xpath( './/div[@class="modal-body"]/div[./h6[contains(., "Nay")]]//a' ): voter_url = vote_link.attrib["href"] - voter_name = self.get_voter_name_from_url(voter_url) + voter_name = self.get_voter_name_from_url_request(voter_url) vote.no(voter_name) for vote_link in vote_modal.xpath( './/div[@class="modal-body"]/div[./h6[contains(., "Absent")]]//a' ): voter_url = vote_link.attrib["href"] - voter_name = self.get_voter_name_from_url(voter_url) + voter_name = self.get_voter_name_from_url_request(voter_url) vote.vote("other", voter_name) yield vote From d99e12124282e9d2f9c7fcce338307c2dde623a6 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Thu, 24 Oct 2024 11:39:22 -0600 Subject: [PATCH 5/5] ND: remove unused voter name identification method --- scrapers/nd/bills.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/scrapers/nd/bills.py b/scrapers/nd/bills.py index e1737fda74..86efc0c6b8 100644 --- a/scrapers/nd/bills.py +++ b/scrapers/nd/bills.py @@ -36,30 +36,6 @@ def create_source_url(self): f"{assembly_session_id}-{year}/data/bills.json" ) - def get_voter_name_from_url(self, url: str) -> str: - """ - Description: - Get the full name from URL - - Example: - - https://ndlegis.gov/biography/liz-conmy -> Liz Conmy - - https://ndlegis.gov/biography/randy-a-schobinger -> Randy A. Schobinger - - """ - name_uri = ( - url.replace("https://ndlegis.gov/biography/", "") - .split("?")[0] - .split("/")[0] - .strip() - ) - name_words = [w.title() for w in name_uri.split("-")] - if len(name_words) == 3 and len(name_words[1]) == 1: - return "{0} {1}. {2}".format(*name_words) - elif len(name_words) == 3 and len(name_words[1]) > 1: - return "{0} {1}-{2}".format(*name_words) - else: - return " ".join(name_words) - def get_voter_name_from_url_request(self, url: str) -> str: """ Description: