Skip to content

Commit

Permalink
Merge pull request #5088 from openstates/ne-bypass-ssl-error
Browse files Browse the repository at this point in the history
NE: bypass SSL/TLS verification error, fix event location parsing
  • Loading branch information
jessemortenson authored Nov 18, 2024
2 parents 59f35b6 + b836983 commit a6fa3cb
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 9 deletions.
2 changes: 2 additions & 0 deletions scrapers/ne/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ class Nebraska(State):
]

def get_session_list(self):
    """Return the legislature session names scraped from the NE bills page.

    Scrapes the text of each <option> in the session <select> dropdown,
    dropping the last entry (presumably a non-session placeholder option —
    verify against the live page).
    """
    # SSL bad as of 2024-11-18: the site's certificate chain fails
    # verification, so TLS verification is disabled for this request.
    sessions_url = "https://nebraskalegislature.gov/bills/"
    options_xpath = "//select[@name='Legislature']/option/text()"
    sessions = url_xpath(sessions_url, options_xpath, verify=False)
    return sessions[:-1]
12 changes: 7 additions & 5 deletions scrapers/ne/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

class NEBillScraper(Scraper, LXMLMixin):
priority_bills = {}
# SSL bad as of 2024-11-18
verify = False

def scrape(self, session=None):
if session is None:
Expand Down Expand Up @@ -41,7 +43,7 @@ def scrape(self, session=None):

def scrape_priorities(self):
priority_url = "https://nebraskalegislature.gov/session/priority.php"
page = self.lxmlize(priority_url)
page = self.lxmlize(priority_url, verify=False)

for row in page.xpath(
"//table[@id='committee_bill_results' or @id='senator_bill_results' or @id='speaker_bill_results']/tr"
Expand All @@ -60,7 +62,7 @@ def scrape_special(self, session: str, start_date: datetime):
"https://nebraskalegislature.gov/bills/search_by_date.php?"
"SessionDay={}&special=1".format(start_date.strftime("%Y"))
)
page = self.lxmlize(main_url)
page = self.lxmlize(main_url, verify=False)

document_links = self.get_nodes(
page,
Expand All @@ -83,7 +85,7 @@ def scrape_year(self, session, year):
"https://nebraskalegislature.gov/bills/search_by_date.php?"
"SessionDay={}".format(year)
)
page = self.lxmlize(main_url)
page = self.lxmlize(main_url, verify=False)

document_links = self.get_nodes(
page,
Expand All @@ -106,7 +108,7 @@ def scrape_year(self, session, year):
yield from self.bill_info(bill_link, session, main_url)

def bill_info(self, bill_link, session, main_url):
bill_page = self.lxmlize(bill_link)
bill_page = self.lxmlize(bill_link, verify=False)

long_title = self.get_node(
bill_page, '//div[@class="main-content"]//h2'
Expand Down Expand Up @@ -291,7 +293,7 @@ def scrape_votes(self, bill, bill_page, chamber):
date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td")
date = datetime.strptime(date_td.text, "%b %d, %Y")
motion_text = motion_td.text_content()
vote_page = self.lxmlize(vote_url)
vote_page = self.lxmlize(vote_url, verify=False)
passed = "Passed" in motion_text or "Advanced" in motion_text

vote = VoteEvent(
Expand Down
9 changes: 7 additions & 2 deletions scrapers/ne/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

class NEEventScraper(Scraper):
_tz = pytz.timezone("US/Central")
# SSL bad as of 2024-11-18
verify = False

# usage: PYTHONPATH=scrapers poetry run os-update ne
# events --scrape start=2020-02-02 end=2020-03-02
Expand Down Expand Up @@ -43,7 +45,7 @@ def scrape(self, start=None, end=None):
"endDay": end.strftime("%d"),
"endYear": end.strftime("%Y"),
}
page = self.post(LIST_URL, args).content
page = self.post(LIST_URL, args, verify=False).content

yield from self.scrape_events(page)

Expand All @@ -66,7 +68,10 @@ def scrape_events(self, page):
'div[contains(@class, "card-header")]/small/text()'
)[0].strip()

(location, time) = details.split(" - ")
# (location, time)
location_time_parts = details.split(" - ")
location = " - ".join(location_time_parts[:-1])
time = location_time_parts[-1]

# turn room numbers into the full address
if location.lower().startswith("room"):
Expand Down
4 changes: 2 additions & 2 deletions scrapers/utils/lxmlize.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def url_xpath(url, path, verify=True, user_agent=None):
class LXMLMixin(object):
"""Mixin for adding LXML helper functions to Open States code."""

def lxmlize(self, url, raise_exceptions=False):
def lxmlize(self, url, raise_exceptions=False, verify=True):
"""Parses document into an LXML object and makes links absolute.
Args:
Expand All @@ -32,7 +32,7 @@ def lxmlize(self, url, raise_exceptions=False):
try:
# This class is always mixed into subclasses of `Scraper`,
# which have a `get` method defined.
response = self.get(url)
response = self.get(url, verify=verify)
except requests.exceptions.SSLError:
self.warning(
"`self.lxmlize()` failed due to SSL error, trying "
Expand Down

0 comments on commit a6fa3cb

Please sign in to comment.