Skip to content

Commit

Permalink
Merge pull request #5088 from openstates/ne-bypass-ssl-error
Browse files Browse the repository at this point in the history
NE: bypass SSL/TLS verification error, fix event location parsing
  • Loading branch information
jessemortenson authored Nov 18, 2024
2 parents 59f35b6 + b836983 commit a6fa3cb
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 9 deletions.
2 changes: 2 additions & 0 deletions scrapers/ne/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ class Nebraska(State):
]

def get_session_list(self):
    """Return the legislature session names scraped from the NE bills page.

    Scrapes the text of each <option> in the session <select> dropdown,
    dropping the last entry (presumably a non-session placeholder option —
    verify against the live page).
    """
    # SSL bad as of 2024-11-18: the site's certificate chain fails
    # verification, so TLS verification is disabled for this request.
    sessions_url = "https://nebraskalegislature.gov/bills/"
    options_xpath = "//select[@name='Legislature']/option/text()"
    sessions = url_xpath(sessions_url, options_xpath, verify=False)
    return sessions[:-1]
12 changes: 7 additions & 5 deletions scrapers/ne/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

class NEBillScraper(Scraper, LXMLMixin):
priority_bills = {}
# SSL bad as of 2024-11-18
verify = False

def scrape(self, session=None):
if session is None:
Expand Down Expand Up @@ -41,7 +43,7 @@ def scrape(self, session=None):

def scrape_priorities(self):
priority_url = "https://nebraskalegislature.gov/session/priority.php"
page = self.lxmlize(priority_url)
page = self.lxmlize(priority_url, verify=False)

for row in page.xpath(
"//table[@id='committee_bill_results' or @id='senator_bill_results' or @id='speaker_bill_results']/tr"
Expand All @@ -60,7 +62,7 @@ def scrape_special(self, session: str, start_date: datetime):
"https://nebraskalegislature.gov/bills/search_by_date.php?"
"SessionDay={}&special=1".format(start_date.strftime("%Y"))
)
page = self.lxmlize(main_url)
page = self.lxmlize(main_url, verify=False)

document_links = self.get_nodes(
page,
Expand All @@ -83,7 +85,7 @@ def scrape_year(self, session, year):
"https://nebraskalegislature.gov/bills/search_by_date.php?"
"SessionDay={}".format(year)
)
page = self.lxmlize(main_url)
page = self.lxmlize(main_url, verify=False)

document_links = self.get_nodes(
page,
Expand All @@ -106,7 +108,7 @@ def scrape_year(self, session, year):
yield from self.bill_info(bill_link, session, main_url)

def bill_info(self, bill_link, session, main_url):
bill_page = self.lxmlize(bill_link)
bill_page = self.lxmlize(bill_link, verify=False)

long_title = self.get_node(
bill_page, '//div[@class="main-content"]//h2'
Expand Down Expand Up @@ -291,7 +293,7 @@ def scrape_votes(self, bill, bill_page, chamber):
date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td")
date = datetime.strptime(date_td.text, "%b %d, %Y")
motion_text = motion_td.text_content()
vote_page = self.lxmlize(vote_url)
vote_page = self.lxmlize(vote_url, verify=False)
passed = "Passed" in motion_text or "Advanced" in motion_text

vote = VoteEvent(
Expand Down
9 changes: 7 additions & 2 deletions scrapers/ne/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

class NEEventScraper(Scraper):
_tz = pytz.timezone("US/Central")
# SSL bad as of 2024-11-18
verify = False

# usage: PYTHONPATH=scrapers poetry run os-update ne
# events --scrape start=2020-02-02 end=2020-03-02
Expand Down Expand Up @@ -43,7 +45,7 @@ def scrape(self, start=None, end=None):
"endDay": end.strftime("%d"),
"endYear": end.strftime("%Y"),
}
page = self.post(LIST_URL, args).content
page = self.post(LIST_URL, args, verify=False).content

yield from self.scrape_events(page)

Expand All @@ -66,7 +68,10 @@ def scrape_events(self, page):
'div[contains(@class, "card-header")]/small/text()'
)[0].strip()

(location, time) = details.split(" - ")
# (location, time)
location_time_parts = details.split(" - ")
location = " - ".join(location_time_parts[:-1])
time = location_time_parts[-1]

# turn room numbers into the full address
if location.lower().startswith("room"):
Expand Down
4 changes: 2 additions & 2 deletions scrapers/utils/lxmlize.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def url_xpath(url, path, verify=True, user_agent=None):
class LXMLMixin(object):
"""Mixin for adding LXML helper functions to Open States code."""

def lxmlize(self, url, raise_exceptions=False):
def lxmlize(self, url, raise_exceptions=False, verify=True):
"""Parses document into an LXML object and makes links absolute.
Args:
Expand All @@ -32,7 +32,7 @@ def lxmlize(self, url, raise_exceptions=False):
try:
# This class is always mixed into subclasses of `Scraper`,
# which have a `get` method defined.
response = self.get(url)
response = self.get(url, verify=verify)
except requests.exceptions.SSLError:
self.warning(
"`self.lxmlize()` failed due to SSL error, trying "
Expand Down

0 comments on commit a6fa3cb

Please sign in to comment.