Skip to content

Commit

Permalink
Marianas Islands Scraper (#4707)
Browse files Browse the repository at this point in the history
Marianas Islands Bill Scraper
  • Loading branch information
showerst authored Oct 13, 2023
1 parent 838f4b6 commit a900326
Show file tree
Hide file tree
Showing 2 changed files with 266 additions and 0 deletions.
39 changes: 39 additions & 0 deletions scrapers/mp/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from openstates.scrape import State
from .bills import MPBillScraper

import requests
import lxml


class NorthernMarianaIslands(State):
scrapers = {
"bills": MPBillScraper,
}
# sessions before 37th are missing many labels
legislative_sessions = [
{
"_scraped_name": "23",
"identifier": "23",
"name": "23rd Commonwealth Legislature",
"start_date": "2023-01-01",
"end_date": "2023-12-31",
"active": True,
},
]
ignored_scraped_sessions = []

def get_session_list(self):

for i in range(0, 23):
self.ignored_scraped_sessions.append(f"{i}")

url = "https://cnmileg.net/house.asp"

cf_headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/108.0.0.0 Safari/537.36" # noqa
}
page = requests.get(url, headers=cf_headers).content
page = lxml.html.fromstring(page)

return page.xpath("//select[@name='legsID']/option/@value")
227 changes: 227 additions & 0 deletions scrapers/mp/bills.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
import cloudscraper
import dateutil
import lxml.html
import pytz
import re

from openstates.scrape import Scraper, Bill


class MPBillScraper(Scraper):
_tz = pytz.timezone("Pacific/Guam")

scraper = cloudscraper.create_scraper()
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0"
}

bill_types = {
"Senate Resolutions:": "SR",
"Senate Bills:": "SB",
"Senate Legislative initiatives:": "SLI",
"Senate Joint Resolutions:": "SJR",
"Senate Local Bills:": "SLB",
"Senate Commemorative Resolutions:": "SR",
}

bill_type_map = {
"HB": "bill",
"SB": "bill",
"HR": "resolution",
"SR": "resolution",
"HLB": "bill",
"SLB": "bill",
"HJR": "joint resolution",
"SJR": "joint resolution",
"HCR": "concurrent resolution",
"HLI": "bill",
"SLI": "bill",
"HLB": "bill",
"SLB": "bill",
"HCommRes": "resolution",
}

def scrape(self, chamber=None, session=None):
chambers = [chamber] if chamber else ["upper", "lower"]
# throwaway request to get our session cookies before the POST
self.scraper.get("https://cnmileg.net/house.asp")

for chamber in chambers:
yield from self.scrape_chamber(chamber, session)

def scrape_chamber(self, chamber, session):

sec_ids = {"upper": "2", "lower": "1"}
data = {
"legtypeID": "A",
"submit": "Go",
"legsID": session,
"secID": sec_ids[chamber],
}
self.info("POST https://cnmileg.net/leglist.asp")

page = self.scraper.post("https://cnmileg.net/leglist.asp", data=data).content
page = lxml.html.fromstring(page)
page.make_links_absolute("https://cnmileg.net/")

for row in page.xpath("//table/tr")[1:]:
bill_url = row.xpath("td[4]/a/@href")[0]
yield from self.scrape_bill(session, chamber, bill_url)

def scrape_bill(self, session, chamber, url):
self.info(f"GET {url}")
page = self.scraper.get(url).content
page = lxml.html.fromstring(page)
page.make_links_absolute(url)

bill_id = self.get_cell_text(page, "Number")

if chamber == "upper":
bill_type = page.xpath("//tr[@class='stshead']/th/text()")[0].strip()
bill_type = self.bill_types[bill_type]
bill_id = f"{bill_type} {bill_id}"

title = self.get_cell_text(page, "Subject/Title")

bill_type = self.bill_type_map[bill_id.split(" ")[0]]

bill = Bill(
identifier=bill_id,
title=title,
legislative_session=session,
chamber=chamber,
classification=bill_type,
)

sponsor = self.get_cell_text(page, "Author")
bill.add_sponsorship(
sponsor, classification="primary", entity_type="person", primary=True
)

version = self.get_cell(page, "Number").xpath("a/@href")[0]
bill.add_version_link(bill_id, version, media_type="application/pdf")

bill.add_source(url)

bill = self.do_action(bill, page, "Date Introduced", chamber, "introduction")
bill = self.do_action(bill, page, "House First Reading", "lower", "reading-1")
bill = self.do_action(bill, page, "House Final Reading", "lower", "reading-2")
bill = self.do_action(bill, page, "Senate First Reading", "upper", "reading-1")
bill = self.do_action(bill, page, "Senate Final Reading", "upper", "reading-2")

last_action = self.get_cell_text(page, "Last Action")
last_updated = self.get_cell_text(page, "Last Updated")

if "withdrawn" in last_action.lower():
bill.add_action(
"Withdrawn",
dateutil.parser.parse(last_updated).strftime("%Y-%m-%d"),
chamber=("upper" if "senate" in last_action.lower() else "lower"),
classification="withdrawal",
)

reports = page.xpath("//a[contains(@url, 'COMMITTEE%20REPORTS')]")
for report in reports:
bill.add_document_link(
report.xpath("text()")[0],
report.xpath("@url"),
media_type="application/pdf",
)

if self.get_cell_text(
page, "Senate Committee"
) == last_action or self.get_cell_text(
page, "Senate Committee"
) == last_action.replace(
"SENATE-", ""
):
bill.add_action(
last_action,
dateutil.parser.parse(last_updated).strftime("%Y-%m-%d"),
chamber="upper",
classification="referral-committee",
)

# e.g. if "JGO" is house com, and "House-JGO" is last action
if self.get_cell_text(
page, "House Committee"
) == last_action or self.get_cell_text(
page, "House Committee"
) == last_action.replace(
"HOUSE-", ""
):
bill.add_action(
last_action,
dateutil.parser.parse(last_updated).strftime("%Y-%m-%d"),
chamber="lower",
classification="referral-committee",
)

if "duly adopted" in last_action.lower():
bill.add_action(
"Duly Adopted",
dateutil.parser.parse(last_updated).strftime("%Y-%m-%d"),
chamber="executive",
classification="enrolled",
)

refs = self.get_cell_text(page, "References")
for line in refs.split("\n"):
if "governor" in line.lower() or re.match(r"g-\s?\d+", line.lower()):
action_date = re.findall(r"\d+\/\d+\/\d+", line)[0]
bill.add_action(
"Sent to Governor",
dateutil.parser.parse(action_date).strftime("%Y-%m-%d"),
chamber="executive",
classification="executive-receipt",
)

# Public Law, or various local laws
if "p.l." in last_action.lower() or ".l.l." in last_action.lower():
bill.add_action(
last_action,
dateutil.parser.parse(last_updated).strftime("%Y-%m-%d"),
chamber="executive",
classification="became-law",
)

if self.get_cell(page, "Last Action").xpath("p/a/@href"):
bill.add_version_link(
last_action,
self.get_cell(page, "Last Action").xpath("p/a/@href")[0],
media_type="application/pdf",
)

if "p.l." in last_action.lower():
bill.add_citation(
"MP Public Laws", last_action, citation_type="chapter"
)

yield bill

# get the table td where the sibling th contains the text
def get_cell(self, page, text: str):
return page.xpath(f"//tr[th[contains(text(), '{text}')]]/td")[0]

def get_cell_text(self, page, text: str) -> str:
return page.xpath(f"string(//tr[th[contains(text(), '{text}')]]/td)").strip()

def do_action(self, bill, page, text, chamber, classification):
action_text = self.get_cell_text(page, text)
if action_text:

try:
action_date = re.findall(r"\d+\/\d+\/\d+", action_text)[0]
except IndexError:
if re.match(r"\d+:\d+:\d+ AM", action_text):
action_date = self.get_cell_text(page, "Last Updated")
else:
return bill

bill.add_action(
text,
dateutil.parser.parse(action_date).strftime("%Y-%m-%d"),
chamber=chamber,
classification=classification,
)
return bill

0 comments on commit a900326

Please sign in to comment.