From 23d1313266cd6d369dd82bc0834b78ca3cf7ebfa Mon Sep 17 00:00:00 2001
From: braykuka <braykuka@gmail.com>
Date: Tue, 27 Feb 2024 19:43:00 +0100
Subject: [PATCH] Fix: CA events scraper hearing time and agenda parsing

---
 scrapers/ca/events_web.py | 64 +++++++++++++++++++++++++++++----------
 1 file changed, 48 insertions(+), 16 deletions(-)

diff --git a/scrapers/ca/events_web.py b/scrapers/ca/events_web.py
index d8e09775f8..11fdeba1d9 100644
--- a/scrapers/ca/events_web.py
+++ b/scrapers/ca/events_web.py
@@ -8,7 +8,7 @@
 import requests
 
 
-strip_chars = ",\t\n\r "
+strip_chars = ".,\t\n\r "
 
 
 class CAEventWebScraper(Scraper, LXMLMixin):
@@ -89,8 +89,8 @@ def scrape_upper(self, start, end):
 
                 when = (
                     " ".join([hearing_date, hearing_time])
-                    .replace("or upon adjournment of Session", "")
-                    .replace("and upon adjournment of Session, if necessary", "")
+                    .split("or")[0]
+                    .split("and")[0]
                     .strip()
                 )
                 when = dateutil.parser.parse(when)
@@ -126,14 +126,21 @@ def scrape_upper_agenda(self, event, committees, url):
         response = self.get(url).json()
         page = lxml.html.fromstring(response["agenda"])
         page.make_links_absolute(url)
+        start = False
 
-        for span in page.xpath(
-            './/span[@class="CommitteeTopic "]/span[@class="HearingTopic"]/following-sibling::span'
-        ):
+        for span in page.xpath('.//span[@class="CommitteeTopic "]/span'):
             span_class = span.xpath("@class")[0].strip(strip_chars)
             span_title = span.xpath("string()").strip(strip_chars)
             span_title = re.sub(r"\s+", " ", span_title)
             span_title = re.sub(r"^\d+", "", span_title)
+            span_title = span_title.replace("SUBJECT:", "").strip(strip_chars)
+
+            if "linesep" in span_class:
+                start = True
+            if not start:
+                continue
+            if "HearingTopic " in span_class:
+                continue
             if not span_title:
                 continue
             agenda = event.add_agenda_item(span_title)
@@ -155,18 +162,18 @@ def scrape_upper_agenda(self, event, committees, url):
                 )
                 agenda.add_person(appointee_name, note=appointee_position)
 
-            elif "Measure " in span_class:
+            elif "Measure row" in span_class:
                 bill_id = (
                     span.xpath('.//a[contains(@class, "MeasureLink")]')[0]
                     .xpath("string()")
                     .replace("No", "")
                     .replace(".", "")
-                    .replace("  ", " ")
+                    .replace(" ", "")
                     .strip(strip_chars)
                 )
                 note = (
-                    span.xpath('.//span[contains(@class, "Topic")]')[0]
-                    .xpath("string()")
+                    " ".join(span.xpath('.//span[contains(@class, "Topic")]//text()'))
+                    .strip(strip_chars)
                     .strip(strip_chars)
                 )
                 agenda.add_bill(bill_id, note=note)
@@ -180,6 +187,7 @@ def scrape_lower(self):
 
         for date_row in page.xpath("//h5[@class='date']"):
             hearing_date = date_row.xpath("string()").strip()
+
             for content_xpath in date_row.xpath(
                 './following-sibling::div[@class="wrapper--border"][1]/div[@class="dailyfile-section-item"]'
             ):
@@ -214,8 +222,8 @@ def scrape_lower(self):
 
                 when = (
                     " ".join([hearing_date, hearing_time])
-                    .replace("or upon adjournment of Session", "")
-                    .replace("and upon adjournment of Session, if necessary", "")
+                    .split("or")[0]
+                    .split("and")[0]
                     .strip()
                 )
                 when = dateutil.parser.parse(when)
@@ -248,13 +256,37 @@ def scrape_lower(self):
                 yield event
 
     def scrape_lower_agenda(self, event, committees, page):
-        for span in page.xpath(
-            './/span[@class="CommitteeTopic"]/span[@class="HearingTopic"]/following-sibling::span'
-        ):
-            span_title = span.xpath("string()").strip(strip_chars)
+        start = False
+
+        for span in page.xpath('.//span[@class="CommitteeTopic"]/span'):
+            span_class = span.xpath("@class")[0].strip(strip_chars)
+            span_title = (" ".join(span.xpath(".//text()"))).strip(strip_chars)
             span_title = re.sub(r"\s+", " ", span_title)
             span_title = re.sub(r"^\d+", "", span_title)
+            span_title = span_title.replace("SUBJECT:", "").strip(strip_chars)
+
+            if "linesp" in span_class or "HearingTopic" in span_class:
+                start = True
+            if not start:
+                continue
+            if "HearingTopic" in span_class or "Header" in span_class:
+                continue
+
             if span_title:
                 agenda = event.add_agenda_item(span_title)
                 for committee in committees:
                     agenda.add_committee(committee, note="host")
+
+            if "Measure" == span_class:
+                bill_id = (
+                    span.xpath(".//a")[0]
+                    .xpath("string()")
+                    .replace("No", "")
+                    .replace(".", "")
+                    .replace(" ", "")
+                    .strip(strip_chars)
+                )
+                note = " ".join(
+                    span.xpath('.//span[contains(@class, "Topic")]//text()')
+                ).strip(strip_chars)
+                agenda.add_bill(bill_id, note=note)