diff --git a/scrapers/ks/__init__.py b/scrapers/ks/__init__.py index 9e80fbfe41..126654c7c1 100644 --- a/scrapers/ks/__init__.py +++ b/scrapers/ks/__init__.py @@ -2,6 +2,7 @@ from openstates.scrape import State from .bills import KSBillScraper from .events import KSEventScraper +from .votes import KSVoteScraper # Kansas API's 429 error response includes: @@ -15,6 +16,7 @@ class Kansas(State): scrapers = { "bills": KSBillScraper, "events": KSEventScraper, + "votes": KSVoteScraper, } legislative_sessions = [ { diff --git a/scrapers/ks/votes.py b/scrapers/ks/votes.py new file mode 100644 index 0000000000..5cab462888 --- /dev/null +++ b/scrapers/ks/votes.py @@ -0,0 +1,128 @@ +import re +import datetime +import requests +import feedparser + +import lxml.html +from openstates.scrape import Scraper, VoteEvent + + +class KSVoteScraper(Scraper): + special_slugs = {"2020S1": "li_2020s", "2021S1": "li_2021s"} + + def scrape(self, session=None): + yield from self.scrape_bill_list(session) + + def scrape_bill_list(self, session): + meta = next( + each + for each in self.jurisdiction.legislative_sessions + if each["identifier"] == session + ) + if meta["classification"] == "special": + list_slug = self.special_slugs[session] + else: + list_slug = "li" + + list_url = f"https://kslegislature.org/{list_slug}/data/feeds/rss/bill_info.xml" + xml = self.get(list_url).content + feed = feedparser.parse(xml) + for item in feed.entries: + yield from self.scrape_vote_from_bill(session, item.title, item.guid) + + def scrape_vote_from_bill(self, session, bill, url): + doc = lxml.html.fromstring(self.get(url, retry_on_404=True).text) + doc.make_links_absolute(url) + all_links = doc.xpath( + "//table[@class='bottom']/tbody[@class='tab-content-sub']/tr/td/a/@href" + ) + vote_members_urls = [] + for i in all_links: + if "vote_view" in i: + vote_members_urls.append(str(i)) + if len(vote_members_urls) > 0: + for link in vote_members_urls: + yield from self.parse_vote(bill, link, session) + + def parse_vote(self, bill, link, session): + # Server sometimes sends proper error headers, + # sometimes not + try: + self.info("Get {}".format(link)) + text = requests.get(link).text + except requests.exceptions.HTTPError as err: + self.warning("{} fetching vote {}, skipping".format(err, link)) + return + + if "Varnish cache server" in text: + self.warning( + "Scrape rate is too high, try re-scraping with " + "The --rpm set to a lower number" + ) + return + + if "Page Not Found" in text or "Page Unavailable" in text: + self.warning("missing vote, skipping") + return + member_doc = lxml.html.fromstring(text) + motion = member_doc.xpath("//div[@id='main_content']/h4/text()") + chamber_date_line = "".join( + member_doc.xpath("//div[@id='main_content']/h3[1]//text()") + ) + chamber_date_line_words = chamber_date_line.split() + vote_chamber = chamber_date_line_words[0] + vote_date = datetime.datetime.strptime(chamber_date_line_words[-1], "%m/%d/%Y") + vote_status = " ".join(chamber_date_line_words[2:-2]) + opinions = member_doc.xpath( + "//div[@id='main_content']/h3[position() > 1]/text()" + ) + if len(opinions) > 0: + vote_status = vote_status if vote_status.strip() else motion[0] + vote_chamber = "upper" if vote_chamber == "Senate" else "lower" + + for i in opinions: + try: + count = int(i[i.find("(") + 1 : i.find(")")]) + except ValueError: + # This is likely not a vote-count text chunk + # It's probably '`On roll call the vote was:` + pass + else: + if "yea" in i.lower(): + yes_count = count + elif "nay" in i.lower(): + no_count = count + elif "present" in i.lower(): + p_count = count + elif "absent" in i.lower(): + a_count = count + + vote = VoteEvent( + bill=bill, + start_date=vote_date.strftime("%Y-%m-%d"), + chamber=vote_chamber, + motion_text=vote_status, + legislative_session=session, + result="pass" if yes_count > no_count else "fail", + classification="passage", + ) + vote.dedupe_key = link + + vote.set_count("yes", yes_count) + vote.set_count("no", no_count) + vote.set_count("abstain", p_count) + vote.set_count("absent", a_count) + + vote.add_source(link) + + a_links = member_doc.xpath("//div[@id='main_content']/a/text()") + for i in range(1, len(a_links)): + if i <= yes_count: + vote.vote("yes", re.sub(",", "", a_links[i]).split()[0]) + elif no_count != 0 and i > yes_count and i <= yes_count + no_count: + vote.vote("no", re.sub(",", "", a_links[i]).split()[0]) + else: + vote.vote("other", re.sub(",", "", a_links[i]).split()[0]) + yield vote + else: + self.warning("No Votes for: %s", link)