Skip to content

Commit

Permalink
Merge pull request #4855 from braykuka/1148-ca-new-events-scraper-that-targets-websites
Browse files Browse the repository at this point in the history

CA: Events: Missing Bill IDs
  • Loading branch information
NewAgeAirbender authored Mar 1, 2024
2 parents ce19d9a + 7c29619 commit ce67596
Showing 1 changed file with 55 additions and 19 deletions.
74 changes: 55 additions & 19 deletions scrapers/ca/events_web.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,7 +8,7 @@
import requests


strip_chars = ",\t\n\r "
strip_chars = ".,\t\n\r "


class CAEventWebScraper(Scraper, LXMLMixin):
Expand Down Expand Up @@ -73,7 +73,7 @@ def scrape_upper(self, start, end):
time_loc = [
row
for row in panel_content.split("\n")
if "p.m." in row or "pm" in row or "a.m." in row or " - " in row
if "p.m." in row or "a.m." in row or " - " in row
]
time_loc = "".join(time_loc)

Expand All @@ -87,9 +87,11 @@ def scrape_upper(self, start, end):
hearing_location = " ".join(time_loc_parts[1:])
hearing_location = hearing_location.strip(strip_chars)

when = " ".join([hearing_date, hearing_time]).strip()
when = re.sub(
r"(or|and) Upon Adjournment(.*)", "", when, flags=re.IGNORECASE
when = (
" ".join([hearing_date, hearing_time])
.split("or")[0]
.split("and")[0]
.strip()
)
when = dateutil.parser.parse(when)
when = self._tz.localize(when)
Expand Down Expand Up @@ -124,14 +126,21 @@ def scrape_upper_agenda(self, event, committees, url):
response = self.get(url).json()
page = lxml.html.fromstring(response["agenda"])
page.make_links_absolute(url)
start = False

for span in page.xpath(
'.//span[@class="CommitteeTopic "]/span[@class="HearingTopic"]/following-sibling::span'
):
for span in page.xpath('.//span[@class="CommitteeTopic "]/span'):
span_class = span.xpath("@class")[0].strip(strip_chars)
span_title = span.xpath("string()").strip(strip_chars)
span_title = re.sub(r"\s+", " ", span_title)
span_title = re.sub(r"^\d+", "", span_title)
span_title = span_title.replace("SUBJECT:", "").strip(strip_chars)

if "linesep" in span_class:
start = True
if not start:
continue
if "HearingTopic " in span_class:
continue
if not span_title:
continue
agenda = event.add_agenda_item(span_title)
Expand All @@ -153,18 +162,18 @@ def scrape_upper_agenda(self, event, committees, url):
)
agenda.add_person(appointee_name, note=appointee_position)

elif "Measure " in span_class:
elif "Measure row" in span_class:
bill_id = (
span.xpath('.//a[contains(@class, "MeasureLink")]')[0]
.xpath("string()")
.replace("No", "")
.replace(".", "")
.replace(" ", " ")
.replace(" ", "")
.strip(strip_chars)
)
note = (
span.xpath('.//span[contains(@class, "Topic")]')[0]
.xpath("string()")
" ".join(span.xpath('.//span[contains(@class, "Topic")]//text()'))
.strip(strip_chars)
.strip(strip_chars)
)
agenda.add_bill(bill_id, note=note)
Expand All @@ -178,6 +187,7 @@ def scrape_lower(self):

for date_row in page.xpath("//h5[@class='date']"):
hearing_date = date_row.xpath("string()").strip()

for content_xpath in date_row.xpath(
'./following-sibling::div[@class="wrapper--border"][1]/div[@class="dailyfile-section-item"]'
):
Expand Down Expand Up @@ -210,9 +220,11 @@ def scrape_lower(self):
)
hearing_location = hearing_location.strip(strip_chars)

when = " ".join([hearing_date, hearing_time]).strip()
when = re.sub(
r"(or|and) Upon Adjournment(.*)", "", when, flags=re.IGNORECASE
when = (
" ".join([hearing_date, hearing_time])
.split("or")[0]
.split("and")[0]
.strip()
)
when = dateutil.parser.parse(when)
when = self._tz.localize(when)
Expand Down Expand Up @@ -244,13 +256,37 @@ def scrape_lower(self):
yield event

def scrape_lower_agenda(self, event, committees, page):
for span in page.xpath(
'.//span[@class="CommitteeTopic"]/span[@class="HearingTopic"]/following-sibling::span'
):
span_title = span.xpath("string()").strip(strip_chars)
start = False

for span in page.xpath('.//span[@class="CommitteeTopic"]/span'):
span_class = span.xpath("@class")[0].strip(strip_chars)
span_title = (" ".join(span.xpath(".//text()"))).strip(strip_chars)
span_title = re.sub(r"\s+", " ", span_title)
span_title = re.sub(r"^\d+", "", span_title)
span_title = span_title.replace("SUBJECT:", "").strip(strip_chars)

if "linesp" in span_class or "HearingTopic" in span_class:
start = True
if not start:
continue
if "HearingTopic" in span_class or "Header" in span_class:
continue

if span_title:
agenda = event.add_agenda_item(span_title)
for committee in committees:
agenda.add_committee(committee, note="host")

if "Measure" == span_class:
bill_id = (
span.xpath(".//a")[0]
.xpath("string()")
.replace("No", "")
.replace(".", "")
.replace(" ", "")
.strip(strip_chars)
)
note = " ".join(
span.xpath('.//span[contains(@class, "Topic")]//text()')
).strip(strip_chars)
agenda.add_bill(bill_id, note=note)

0 comments on commit ce67596

Please sign in to comment.