Merge pull request #4855 from braykuka/1148-ca-new-events-scraper-tha…

…t-targets-websites CA: Events: Missing Bill IDs
openstates · Mar 1, 2024 · ce67596 · ce67596
2 parents ce19d9a + 7c29619
commit ce67596
Showing 1 changed file with 55 additions and 19 deletions.
diff --git a/scrapers/ca/events_web.py b/scrapers/ca/events_web.py
@@ -8,7 +8,7 @@
 import requests
 
 
-strip_chars = ",\t\n\r "
+strip_chars = ".,\t\n\r "
 
 
 class CAEventWebScraper(Scraper, LXMLMixin):
@@ -73,7 +73,7 @@ def scrape_upper(self, start, end):
                 time_loc = [
                     row
                     for row in panel_content.split("\n")
-                    if "p.m." in row or "pm" in row or "a.m." in row or " - " in row
+                    if "p.m." in row or "a.m." in row or " - " in row
                 ]
                 time_loc = "".join(time_loc)
 
@@ -87,9 +87,11 @@ def scrape_upper(self, start, end):
                 hearing_location = " ".join(time_loc_parts[1:])
                 hearing_location = hearing_location.strip(strip_chars)
 
-                when = " ".join([hearing_date, hearing_time]).strip()
-                when = re.sub(
-                    r"(or|and) Upon Adjournment(.*)", "", when, flags=re.IGNORECASE
+                when = (
+                    " ".join([hearing_date, hearing_time])
+                    .split("or")[0]
+                    .split("and")[0]
+                    .strip()
                 )
                 when = dateutil.parser.parse(when)
                 when = self._tz.localize(when)
@@ -124,14 +126,21 @@ def scrape_upper_agenda(self, event, committees, url):
         response = self.get(url).json()
         page = lxml.html.fromstring(response["agenda"])
         page.make_links_absolute(url)
+        start = False
 
-        for span in page.xpath(
-            './/span[@class="CommitteeTopic "]/span[@class="HearingTopic"]/following-sibling::span'
-        ):
+        for span in page.xpath('.//span[@class="CommitteeTopic "]/span'):
             span_class = span.xpath("@class")[0].strip(strip_chars)
             span_title = span.xpath("string()").strip(strip_chars)
             span_title = re.sub(r"\s+", " ", span_title)
             span_title = re.sub(r"^\d+", "", span_title)
+            span_title = span_title.replace("SUBJECT:", "").strip(strip_chars)
+
+            if "linesep" in span_class:
+                start = True
+            if not start:
+                continue
+            if "HearingTopic " in span_class:
+                continue
             if not span_title:
                 continue
             agenda = event.add_agenda_item(span_title)
@@ -153,18 +162,18 @@ def scrape_upper_agenda(self, event, committees, url):
                 )
                 agenda.add_person(appointee_name, note=appointee_position)
 
-            elif "Measure " in span_class:
+            elif "Measure row" in span_class:
                 bill_id = (
                     span.xpath('.//a[contains(@class, "MeasureLink")]')[0]
                     .xpath("string()")
                     .replace("No", "")
                     .replace(".", "")
-                    .replace("  ", " ")
+                    .replace(" ", "")
                     .strip(strip_chars)
                 )
                 note = (
-                    span.xpath('.//span[contains(@class, "Topic")]')[0]
-                    .xpath("string()")
+                    " ".join(span.xpath('.//span[contains(@class, "Topic")]//text()'))
+                    .strip(strip_chars)
                     .strip(strip_chars)
                 )
                 agenda.add_bill(bill_id, note=note)
@@ -178,6 +187,7 @@ def scrape_lower(self):
 
         for date_row in page.xpath("//h5[@class='date']"):
             hearing_date = date_row.xpath("string()").strip()
+
             for content_xpath in date_row.xpath(
                 './following-sibling::div[@class="wrapper--border"][1]/div[@class="dailyfile-section-item"]'
             ):
@@ -210,9 +220,11 @@ def scrape_lower(self):
                 )
                 hearing_location = hearing_location.strip(strip_chars)
 
-                when = " ".join([hearing_date, hearing_time]).strip()
-                when = re.sub(
-                    r"(or|and) Upon Adjournment(.*)", "", when, flags=re.IGNORECASE
+                when = (
+                    " ".join([hearing_date, hearing_time])
+                    .split("or")[0]
+                    .split("and")[0]
+                    .strip()
                 )
                 when = dateutil.parser.parse(when)
                 when = self._tz.localize(when)
@@ -244,13 +256,37 @@ def scrape_lower(self):
                 yield event
 
     def scrape_lower_agenda(self, event, committees, page):
-        for span in page.xpath(
-            './/span[@class="CommitteeTopic"]/span[@class="HearingTopic"]/following-sibling::span'
-        ):
-            span_title = span.xpath("string()").strip(strip_chars)
+        start = False
+
+        for span in page.xpath('.//span[@class="CommitteeTopic"]/span'):
+            span_class = span.xpath("@class")[0].strip(strip_chars)
+            span_title = (" ".join(span.xpath(".//text()"))).strip(strip_chars)
             span_title = re.sub(r"\s+", " ", span_title)
             span_title = re.sub(r"^\d+", "", span_title)
+            span_title = span_title.replace("SUBJECT:", "").strip(strip_chars)
+
+            if "linesp" in span_class or "HearingTopic" in span_class:
+                start = True
+            if not start:
+                continue
+            if "HearingTopic" in span_class or "Header" in span_class:
+                continue
+
             if span_title:
                 agenda = event.add_agenda_item(span_title)
                 for committee in committees:
                     agenda.add_committee(committee, note="host")
+
+            if "Measure" == span_class:
+                bill_id = (
+                    span.xpath(".//a")[0]
+                    .xpath("string()")
+                    .replace("No", "")
+                    .replace(".", "")
+                    .replace(" ", "")
+                    .strip(strip_chars)
+                )
+                note = " ".join(
+                    span.xpath('.//span[contains(@class, "Topic")]//text()')
+                ).strip(strip_chars)
+                agenda.add_bill(bill_id, note=note)