Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HI: fix bugs in bill version selection, correct header version URL #5099

Merged
merged 2 commits into from
Nov 26, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 30 additions & 14 deletions scrapers/hi/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def _sponsor_interceptor(line):
interceptors = {"Introducer(s)": _sponsor_interceptor}

ret = {}
for tr in metainf_table:
for tr in metainf_table.cssselect("tr"):
row = tr.xpath("td")
key = row[0].text_content().strip()
value = row[1].text_content().strip()
Expand Down Expand Up @@ -161,9 +161,26 @@ def clean_voter_name(self, name):
name = name[:-1]
return name.strip()

def parse_bill_versions_table(self, bill, versions):
if not versions:
self.logger.warning("No version table for {}".format(bill.identifier))
def parse_bill_versions_table(self, bill, bill_page):
no_versions_warnings = bill_page.xpath(
"//*[contains(@id, 'MainContent_UpdatePanel2')]"
"//span[contains(text(),'You may search in our Document Directories')]"
)
if len(no_versions_warnings) == 1:
# Text on the page indicates there are no versions for this bill, which happens once in a while
self.logger.info(
"No bill versions posted yet for {}".format(bill.identifier)
)
return
else:
versions = bill_page.xpath(
"//*[contains(@id, 'MainContent_UpdatePanel2')]//a/img/../.."
)
if len(versions) == 0:
self.logger.warning(
"Failed to select bill versions for {}".format(bill.identifier)
)
return

for version in versions:
td = version.xpath("./a")[0]
Expand Down Expand Up @@ -248,17 +265,13 @@ def scrape_bill(self, session, chamber, bill_type, url):

qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
versions = bill_page.xpath(
"//*[@id='ctl00_MainContent_UpdatePanel2']/div/div/div"
)

try:
metainf_table = bill_page.xpath(
'//div[contains(@id, "itemPlaceholder")]//table[1]'
)[0]
except IndexError:
self.error("Missing Metainf table")
self.error(bill_html)
self.error(f"Missing Metainf table on {url}")
return

action_table = bill_page.xpath(
Expand Down Expand Up @@ -332,11 +345,14 @@ def scrape_bill(self, session, chamber, bill_type, url):
if "gm" in bill_id.lower():
b.add_sponsorship("governor", "primary", "person", True)

self.parse_bill_versions_table(b, versions)
self.parse_bill_versions_table(b, bill_page)
self.parse_testimony(b, bill_page)
self.parse_cmte_reports(b, bill_page)

if bill_page.xpath("//input[@id='MainContent_ImageButtonPDF']"):
if (
bill_page.xpath("//input[@id='MainContent_ImageButtonPDF']")
and len(b.versions) == 0
):
self.parse_bill_header_versions(b, bill_id, session, bill_page)

current_referral = meta["Current Referral"].strip()
Expand All @@ -355,10 +371,10 @@ def scrape_bill(self, session, chamber, bill_type, url):

# Sometimes the site links to a version that appears only in the page header
# and is served via a form submit, so hardcode its URL here.
# jessemortenson: not sure that this condition still occurs;
# couldn't find evidence of it in the late 2024 session.
def parse_bill_header_versions(self, bill, bill_id, session, page):
pdf_link = (
f"https://capitol.hawaii.gov/session{session[0:4]}/bills/{bill_id}_.PDF"
)
pdf_link = f"https://capitol.hawaii.gov/session/session{session[0:4]}/bills/{bill_id}_.PDF"
bill.add_version_link(
bill_id,
pdf_link,
Expand Down