diff --git a/scrapers/ca/events_web.py b/scrapers/ca/events_web.py index aadd39e0da..11fdeba1d9 100644 --- a/scrapers/ca/events_web.py +++ b/scrapers/ca/events_web.py @@ -8,7 +8,7 @@ import requests -strip_chars = ",\t\n\r " +strip_chars = ".,\t\n\r " class CAEventWebScraper(Scraper, LXMLMixin): @@ -73,7 +73,7 @@ def scrape_upper(self, start, end): time_loc = [ row for row in panel_content.split("\n") - if "p.m." in row or "pm" in row or "a.m." in row or " - " in row + if "p.m." in row or "a.m." in row or " - " in row ] time_loc = "".join(time_loc) @@ -87,9 +87,11 @@ def scrape_upper(self, start, end): hearing_location = " ".join(time_loc_parts[1:]) hearing_location = hearing_location.strip(strip_chars) - when = " ".join([hearing_date, hearing_time]).strip() - when = re.sub( - r"(or|and) Upon Adjournment(.*)", "", when, flags=re.IGNORECASE + when = ( + " ".join([hearing_date, hearing_time]) + .split("or")[0] + .split("and")[0] + .strip() ) when = dateutil.parser.parse(when) when = self._tz.localize(when) @@ -124,14 +126,21 @@ def scrape_upper_agenda(self, event, committees, url): response = self.get(url).json() page = lxml.html.fromstring(response["agenda"]) page.make_links_absolute(url) + start = False - for span in page.xpath( - './/span[@class="CommitteeTopic "]/span[@class="HearingTopic"]/following-sibling::span' - ): + for span in page.xpath('.//span[@class="CommitteeTopic "]/span'): span_class = span.xpath("@class")[0].strip(strip_chars) span_title = span.xpath("string()").strip(strip_chars) span_title = re.sub(r"\s+", " ", span_title) span_title = re.sub(r"^\d+", "", span_title) + span_title = span_title.replace("SUBJECT:", "").strip(strip_chars) + + if "linesep" in span_class: + start = True + if not start: + continue + if "HearingTopic " in span_class: + continue if not span_title: continue agenda = event.add_agenda_item(span_title) @@ -153,18 +162,18 @@ def scrape_upper_agenda(self, event, committees, url): ) agenda.add_person(appointee_name, note=appointee_position) - elif "Measure " in span_class: + elif "Measure row" in span_class: bill_id = ( span.xpath('.//a[contains(@class, "MeasureLink")]')[0] .xpath("string()") .replace("No", "") .replace(".", "") - .replace(" ", " ") + .replace(" ", "") .strip(strip_chars) ) note = ( - span.xpath('.//span[contains(@class, "Topic")]')[0] - .xpath("string()") + " ".join(span.xpath('.//span[contains(@class, "Topic")]//text()')) + .strip(strip_chars) .strip(strip_chars) ) agenda.add_bill(bill_id, note=note) @@ -178,6 +187,7 @@ def scrape_lower(self): for date_row in page.xpath("//h5[@class='date']"): hearing_date = date_row.xpath("string()").strip() + for content_xpath in date_row.xpath( './following-sibling::div[@class="wrapper--border"][1]/div[@class="dailyfile-section-item"]' ): @@ -210,9 +220,11 @@ def scrape_lower(self): ) hearing_location = hearing_location.strip(strip_chars) - when = " ".join([hearing_date, hearing_time]).strip() - when = re.sub( - r"(or|and) Upon Adjournment(.*)", "", when, flags=re.IGNORECASE + when = ( + " ".join([hearing_date, hearing_time]) + .split("or")[0] + .split("and")[0] + .strip() ) when = dateutil.parser.parse(when) when = self._tz.localize(when) @@ -244,13 +256,37 @@ def scrape_lower(self): yield event def scrape_lower_agenda(self, event, committees, page): - for span in page.xpath( - './/span[@class="CommitteeTopic"]/span[@class="HearingTopic"]/following-sibling::span' - ): - span_title = span.xpath("string()").strip(strip_chars) + start = False + + for span in page.xpath('.//span[@class="CommitteeTopic"]/span'): + span_class = span.xpath("@class")[0].strip(strip_chars) + span_title = (" ".join(span.xpath(".//text()"))).strip(strip_chars) span_title = re.sub(r"\s+", " ", span_title) span_title = re.sub(r"^\d+", "", span_title) + span_title = span_title.replace("SUBJECT:", "").strip(strip_chars) + + if "linesp" in span_class or "HearingTopic" in span_class: + start = True + if not start: + continue + if "HearingTopic" in span_class or "Header" in span_class: + continue + if span_title: agenda = event.add_agenda_item(span_title) for committee in committees: agenda.add_committee(committee, note="host") + + if "Measure" == span_class: + bill_id = ( + span.xpath(".//a")[0] + .xpath("string()") + .replace("No", "") + .replace(".", "") + .replace(" ", "") + .strip(strip_chars) + ) + note = " ".join( + span.xpath('.//span[contains(@class, "Topic")]//text()') + ).strip(strip_chars) + agenda.add_bill(bill_id, note=note)