Merge pull request #5066 from braykuka/PA-date-parsing-issue
PA: Date and URL issue
jessemortenson authored Oct 31, 2024
2 parents adb83b4 + d3969d1 commit 1f6867b
Showing 3 changed files with 69 additions and 47 deletions.
11 changes: 7 additions & 4 deletions scrapers/pa/__init__.py
@@ -237,7 +237,10 @@ def get_session_list(self):
self.ignored_scraped_sessions.append("{} Regular Session".format(i))
self.ignored_scraped_sessions.append("{} Special Session #1".format(i))

return url_xpath(
"http://www.legis.state.pa.us/cfdocs/legis/home/bills/",
'//select[@id="billSessions"]/option/text()',
)
return [
session.strip()
for session in url_xpath(
"https://www.palegis.us/legislation/bills",
'//select[@id="sessionSelect"]/option/text()',
)
]
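
The new get_session_list pulls the option text from the session dropdown on the new palegis.us bills page and strips stray whitespace from each entry. A rough standalone sketch of the fetch-and-query pattern behind url_xpath, assuming requests and lxml; the real helper lives in the openstates scraper utilities and may differ in headers, retries, and error handling:

import requests
import lxml.html


def url_xpath_sketch(url, xpath):
    # Fetch a page and evaluate an XPath expression against it
    html = requests.get(url, timeout=30).text
    doc = lxml.html.fromstring(html)
    return doc.xpath(xpath)


# Mirrors the new get_session_list: collect the dropdown entries and
# strip whitespace around each one
sessions = [
    s.strip()
    for s in url_xpath_sketch(
        "https://www.palegis.us/legislation/bills",
        '//select[@id="sessionSelect"]/option/text()',
    )
]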
104 changes: 62 additions & 42 deletions scrapers/pa/bills.py
@@ -28,13 +28,28 @@ def scrape(self, chamber=None, session=None):

def scrape_session(self, chamber, session, special=0):
url = utils.bill_list_url(chamber, session, special)
page = self.get_page(url)

page = self.get(url).text
page = lxml.html.fromstring(page)
page.make_links_absolute(url)

RETRY_TIMES = 5
for link in page.xpath('//a[@class="bill"]'):
yield from self.parse_bill(chamber, session, special, link)
is_parsed = False
for retry_time in range(0, RETRY_TIMES):
try:
yield from self.parse_bill(chamber, session, special, link)
is_parsed = True
break
except Exception as e:
self.logger.warning(
"There was an error in scraping {}: Retry {}: Error: {}".format(
link.attrib["href"], retry_time + 1, e
)
)
if not is_parsed:
self.logger.error(
"Bill {} did not scrape due to the page scraping error. Skip".format(
link.text.strip()
)
)

def parse_bill(self, chamber, session, special, link):
bill_id = link.text.strip()
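
The retry loop added to scrape_session above attempts each bill page up to five times before logging an error and skipping it. A standalone sketch of that retry-then-skip pattern, where items and scrape_item are hypothetical stand-ins rather than names from this repo; note that because parse_bill is a generator, any items it yielded before failing would be yielded again on retry, so the pattern is safest when failures occur before output is produced:

import logging

logger = logging.getLogger(__name__)
RETRY_TIMES = 5


def scrape_with_retries(items, scrape_item):
    for item in items:
        for attempt in range(1, RETRY_TIMES + 1):
            try:
                yield from scrape_item(item)
                break
            except Exception as e:
                logger.warning("Error scraping %s (attempt %s): %s", item, attempt, e)
        else:
            # for/else: this branch runs only when the loop never hit break,
            # i.e. every attempt failed (same job as the is_parsed flag)
            logger.error("Giving up on %s after %s attempts", item, RETRY_TIMES)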
@@ -46,9 +61,7 @@ def parse_bill(self, chamber, session, special, link):
btype = ["resolution"]

url = utils.info_url(session, special, bill_id)
page = self.get(url).text
page = lxml.html.fromstring(page)
page.make_links_absolute(url)
page = self.get_page(url)

xpath = (
'//div[contains(@class, "header ")]/following-sibling::*[1]'
@@ -139,9 +152,7 @@ def parse_bill_versions(self, bill, page):
)

def scrape_amendments(self, bill, link, chamber_pretty):
html = self.get(link).text
page = lxml.html.fromstring(html)
page.make_links_absolute(link)
page = self.get_page(link)

for row in page.xpath('//div[contains(@class, "card shadow")]'):
version_name = "".join(
@@ -242,32 +253,34 @@ def parse_votes(self, bill, page):
for url in page.xpath(
'//div[contains(text(), "Votes")]/following-sibling::div[1]//a/@href'
):
# the floor rc urls are old now. we need to update the new urls
url = url.strip()
# Some floor roll-call URLs still use the old domain
# and need to be rewritten to the new one
if url.startswith(utils.old_base_url):
url = self.update_new_url(url)

# remove duplicates of urls
if url in vote_urls or "roll-calls" not in url:
url = self.fix_url_domain(url)
# Skip duplicate URLs
if url in vote_urls:
self.logger.debug("Skipping duplicate vote URL: {}".format(url))
continue
vote_urls.append(url)

bill.add_source(url)
doc = self.get(url).text
doc = lxml.html.fromstring(doc)
doc.make_links_absolute(url)

if "/roll-calls/" in url:
yield from self.parse_chamber_votes(bill, doc, url)
yield from self.parse_chamber_votes(bill, url)
elif "/roll-call-votes/" in url:
yield from self.parse_committee_votes(bill, doc, url)
yield from self.parse_committee_votes(bill, url)
else:
msg = "Unexpected vote url: %r" % url
print(msg)
# raise Exception(msg)
continue
raise Exception(msg)

def get_page(self, url):
html = self.get(url).text
page = lxml.html.fromstring(html)
page.make_links_absolute(url)
return page

def parse_chamber_votes(self, bill, url):
page = self.get_page(url)

def parse_chamber_votes(self, bill, page, url):
chamber = "upper" if "Senate" in page.xpath("string(//h1)") else "lower"
date_str = (
page.xpath(
@@ -277,7 +290,9 @@ def parse_chamber_votes(self, bill, page, url):
.strip()
)
date_str = re.sub(r"\s+", " ", date_str)
date = datetime.datetime.strptime(date_str, "%A %b %d, %Y %I:%M %p")
date = tz.localize(
datetime.datetime.strptime(date_str, "%A %b %d, %Y %I:%M %p")
)

xpath = 'string(//div[contains(@class,h6)][contains(text(), "Action")]/..)'
motion = page.xpath(xpath).replace("Action", "").strip()
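
This hunk carries the core of the date fix: collapse runs of whitespace, parse with strptime, and localize immediately so the VoteEvent gets an aware datetime rather than a naive one. A worked sketch, assuming tz is a pytz timezone (Pennsylvania is US Eastern) and using a made-up date string:

import re
import datetime

import pytz

tz = pytz.timezone("America/New_York")

date_str = "Wednesday   Oct 30, 2024  11:05 AM"  # illustrative only
date_str = re.sub(r"\s+", " ", date_str)  # "Wednesday Oct 30, 2024 11:05 AM"
date = tz.localize(
    datetime.datetime.strptime(date_str, "%A %b %d, %Y %I:%M %p")
)
# date is now 2024-10-30 11:05:00-04:00 (EDT), ready to use as start_date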
@@ -320,7 +335,7 @@ def parse_chamber_votes(self, bill, page, url):

vote = VoteEvent(
chamber=chamber,
start_date=tz.localize(date),
start_date=date,
motion_text=motion,
classification=type,
result="pass" if yeas > (nays + other) else "fail",
@@ -365,7 +380,9 @@ def parse_chamber_votes(self, bill, page, url):

yield vote

def parse_committee_votes(self, bill, doc, url):
def parse_committee_votes(self, bill, url):
doc = self.get_page(url)

chamber = "upper" if "Senate" in doc.xpath("string(//h1)") else "lower"
committee = doc.xpath(
'string(//div[contains(@class, "detailsLabel")][contains(., "Committe")]/following-sibling::div/a)'
@@ -374,26 +391,26 @@ def parse_committee_votes(self, bill, doc, url):
date = doc.xpath(
'string(//div[contains(@class, "detailsLabel")][contains(., "Date")]/following-sibling::div/a)'
).strip()
date = datetime.datetime.strptime(date, "%B %d, %Y")

date = tz.localize(datetime.datetime.strptime(date, "%B %d, %Y"))
self.logger.info("Committe Vote Date: {}, URL: {}".format(date, url))
# Motion
motion = doc.xpath(
'string(//div[contains(@class, "portlet ")]//div[contains(@class, "h5 ")][contains(., "Motion:")]/span[2])'
'string(//div[contains(text(), "Type of Motion")]/following-sibling::div[1])'
).strip()
motion = "Committee vote (%s): %s" % (committee, motion)

# Roll call
rollcall = self.parse_upper_committee_vote_rollcall(bill, doc)
rollcall = self.parse_upper_committee_vote_rollcall(doc)

vote = VoteEvent(
chamber=chamber,
start_date=tz.localize(date),
start_date=date,
motion_text=motion,
classification=[],
result="pass" if rollcall["passed"] else "fail",
bill=bill,
)
vote.dedupe_key = url
vote.dedupe_key = url + "#" + bill.identifier
vote.set_count("yes", rollcall["yes_count"])
vote.set_count("no", rollcall["no_count"])
vote.set_count("other", rollcall["other_count"])
@@ -403,9 +420,10 @@ def parse_committee_votes(self, bill, doc, url):
vote.vote(voteval, name)

vote.add_source(url)

yield vote

def parse_upper_committee_vote_rollcall(self, bill, doc):
def parse_upper_committee_vote_rollcall(self, doc):
rollcall = collections.defaultdict(list)

for div in doc.xpath(
@@ -431,11 +449,11 @@ def parse_upper_committee_vote_rollcall(self, bill, doc):
rollcall[voteval + "_votes"].append(name)

for voteval, xpath in (
("yes", '//ul/li//span[contains(@class, "badge ")][@aria-label="Yea"]'),
("no", '//ul/li//span[contains(@class, "badge ")][@aria-label="Nay"]'),
("yes", '//ul/li//span[contains(@class, "badge")][@title="Yea"]'),
("no", '//ul/li//span[contains(@class, "badge")][@title="Nay"]'),
(
"other",
'//ul/li//span[contains(@class, "badge ")][@aria-label="No Vote"]',
'//ul/li//span[contains(@class, "badge")][@title="No Vote"]',
),
):
count = len(doc.xpath(xpath))
@@ -451,7 +469,6 @@ def mimetype_from_class(self, link):
"fa-file-pdf": "application/pdf",
"fa-file-word": "application/msword",
}

try:
span = link[0]
except IndexError:
@@ -461,7 +478,10 @@ def mimetype_from_class(self, link):
mimetype = mimetypes[cls]
return mimetype

def update_new_url(self, url):
def fix_url_domain(self, url):
# Some vote urls have the old domain in the new website
# https://www.legis.state.pa.us/cfdocs/legis/RC/Public/rc_view_action2.cfm
# ?sess_yr=2023&sess_ind=1&rc_body=H&rc_nbr=17
url_query = url.split("?")[1]
url_query_obj = urllib.parse.parse_qs(url_query)
chamber = "house" if url_query_obj["rc_body"][0] == "H" else "senate"
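
The fix_url_domain hunk is truncated here. The visible part parses the legacy query string (sess_yr, sess_ind, rc_body, rc_nbr) and picks a chamber; since parse_votes above routes floor votes by "/roll-calls/" in the URL, the rest presumably rebuilds a palegis.us roll-call URL from those parts. A hypothetical completion, for illustration only and not the committed code:

import urllib.parse

base_url = "https://www.palegis.us"  # from scrapers/pa/utils.py


def fix_url_domain(url):
    url_query = url.split("?")[1]
    q = urllib.parse.parse_qs(url_query)
    chamber = "house" if q["rc_body"][0] == "H" else "senate"
    # The path and parameter layout below are guesses at the new site's
    # roll-call URL shape; only the "/roll-calls/" segment is grounded in
    # the routing logic shown in parse_votes
    return (
        f"{base_url}/{chamber}/roll-calls/"
        f"{q['sess_yr'][0]}/{q['sess_ind'][0]}/{q['rc_nbr'][0]}"
    )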
1 change: 0 additions & 1 deletion scrapers/pa/utils.py
@@ -1,7 +1,6 @@
import datetime
from typing import Literal


old_base_url = "https://www.legis.state.pa.us"
base_url = "https://www.palegis.us"
urls = {
