Merge pull request #1129 from City-Bureau/fix-ssa
🕷️ Fix spider: Chicago Department of Public Health
SimmonsRitchie authored Jul 24, 2024
2 parents 6f054cf + c0b31d7 commit 3f47ca3
Showing 4 changed files with 3,905 additions and 817 deletions.
city_scrapers/spiders/chi_pubhealth.py: 170 changes (57 additions, 113 deletions)
@@ -1,141 +1,85 @@
 import re
 from datetime import datetime
+from urllib.parse import urljoin, urlparse
 
 from city_scrapers_core.constants import BOARD
 from city_scrapers_core.items import Meeting
 from city_scrapers_core.spiders import CityScrapersSpider
+from dateutil.parser import parse as dateparse
 
 
 class ChiPubHealthSpider(CityScrapersSpider):
     name = "chi_pubhealth"
     agency = "Chicago Department of Public Health"
     timezone = "America/Chicago"
+    start_urls = ["https://www.chicago.gov/city/en/depts/cdph/supp_info.html"]
+    start_time = "9:00 am"
+    end_time = "10:30 am"
+    location = {
+        "name": "DePaul Center",
+        "address": "333 S State St, 2nd Floor, Room 200",
+    }
 
-    @property
-    def start_urls(self):
+    def parse(self, response):
         """
-        DPH generally uses a standard URL format, but sometimes deviates from
-        the pattern. This property inserts the current year into the standard
-        format, as well as known variants, in hopes DPH sticks to one of their
-        conventions and this scraper does not need to be updated annually.
+        Loop over each link to meeting pages for each year
+        and then parse the meeting page.
         """
-        standard_url = "https://www.chicago.gov/city/en/depts/cdph/supp_info/boh/{}-board-of-health-meetings.html"  # noqa
-        url_variant_1 = "https://www.chicago.gov/city/en/depts/cdph/supp_info/boh/{}-board-of-health.html"  # noqa
-
-        # current_year = 2021
-        current_year = datetime.now().year
-
-        return [
-            standard_url.format(current_year),
-            url_variant_1.format(current_year),
-        ]
+        for link in response.css(".list-group-item.list-group-item-action a"):
+            text = link.css("::text").extract_first()
+            if "board of health meetings" in text.lower():
+                url = link.css("::attr(href)").extract_first()
+                yield response.follow(url, callback=self._parse_year_page)
 
-    def parse(self, response):
+    def _parse_year_page(self, response):
         """
-        `parse` should always `yield` Meeting items.
-        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
-        needs.
+        Parse the year page and return the list of meetings,
+        which appear to be monthly.
         """
-        title = response.xpath('//h1[@class="page-heading"]/text()').extract_first()
-
-        # Extract year and meeting name from title like "2017 Board of Health Meetings"
-        year_match = re.match(r"\d{4}", title)
-        self.year = int(year_match.group())
-
-        # The description and meeting dates are a series of p elements
-        for idx, item in enumerate(response.css(".page-description-above p"), start=1):
-            if idx == 1:
-                # inspect_response(response, self)
-                # Description is the first p element
-                description = item.xpath("text()").getall()
-                # description = item.xpath("text()").extract_first()
-                if "333 S" not in description[1]:
-                    raise ValueError(description)
-                continue
-
-            # Skip empty rows
-            if not item.css("*::text").extract_first().strip():
+        page_title = response.css("h1.page-heading::text").extract_first()
+        year = re.search(r"\d{4}", page_title).group()
+        base_url = self._get_base_url(response)
+        for item in response.css(".accordion-item.card.section"):
+            links = self._parse_links(item, base_url)
+            # if there are no links then assume that month's meeting
+            # has been cancelled
+            if not links:
                 continue
 
-            start = self._parse_start(item)
-            if start is None:
-                continue
-
+            start, end = self._parse_dates(item, year)
             meeting = Meeting(
-                title="Board of Health",
+                title="Board of Health meeting",
                 description="",
                 classification=BOARD,
                 start=start,
-                end=None,
-                time_notes="",
+                end=end,
                 all_day=False,
-                location={
-                    "name": "2nd Floor Board Room, DePaul Center",
-                    "address": "333 S State St, Chicago, IL 60604",
-                },
-                links=self._parse_links(item, response),
+                time_notes="",
+                location=self.location,
+                links=links,
                 source=response.url,
             )
-            meeting["status"] = self._get_status(meeting)
             meeting["id"] = self._get_id(meeting)
+            meeting["status"] = self._get_status(
+                meeting, text=" ".join(item.css("*::text").extract())
+            )
             yield meeting
 
-    def _parse_date(self, item):
-        """
-        Parse the meeting date.
-        """
-        # Future meetings are plain text
-        date_text = (item.xpath("text()").extract_first() or "").strip()
-
-        if not date_text:
-            # Past meetings are links to the agenda
-            date_text = item.xpath("a/text()").extract_first()
-        if date_text is None:
-            return None
-        # Remove extra whitespace characters
-        date_text = re.sub(r"\s+", " ", str(date_text)).strip()
-
-        # Handle typos like "December18"
-        if re.match(r"[a-zA-Z]+\d+", str(date_text)):
-            date_match = re.search(r"(?P<month>[a-zA-Z]+)(?P<day>\d+)", date_text)
-            date_text = "{} {}".format(
-                date_match.group("month"), date_match.group("day")
-            )
-        # Extract date formatted like "January 12"
-
-        return datetime.strptime(date_text, "%B %d")
-
-    def _parse_start(self, item):
-        """
-        Parse the meeting date and set start time to 9am.
-        """
-        datetime_obj = self._parse_date(item)
-
-        if datetime_obj is None:
-            return None
-
-        return datetime(self.year, datetime_obj.month, datetime_obj.day, 9)
-
-    def _parse_links(self, item, response):
-        """
-        Parse agenda and minutes, if available.
-        """
-        documents = []
-
-        agenda_relative_url = item.xpath("a/@href").extract_first()
-        if agenda_relative_url:
-            documents.append(
-                {"href": response.urljoin(agenda_relative_url), "title": "Agenda"}
-            )
-
-        minutes_relative_url = item.xpath(
-            "following-sibling::ul/li/a/@href"
-        ).extract_first()
-        if minutes_relative_url:
-            documents.append(
-                {"href": response.urljoin(minutes_relative_url), "title": "Minutes"}
-            )
-        return documents
+    def _parse_dates(self, item, year):
+        """Parse the date range from the text"""
+        date_str = item.css("h3::text").extract_first()
+        start_str = f"{date_str}, {year} {self.start_time}"
+        end_str = f"{date_str}, {year} {self.end_time}"
+        start = dateparse(start_str)
+        end = dateparse(end_str)
+        return start, end
+
+    def _get_base_url(self, response):
+        return f"{urlparse(response.url).scheme}://{urlparse(response.url).netloc}"
+
+    def _parse_links(self, item, base_url):
+        result = []
+        links = item.css(".starlist.document li a")
+        for link in links:
+            text = link.css("::text").extract_first()
+            clean_text = re.sub(r"\s+", " ", text)
+            url = link.css("::attr(href)").extract_first()
+            url_abs = urljoin(base_url, url)
+            result.append({"title": clean_text, "href": url_abs})
+        return result
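
For reviewers who want to sanity-check the new date handling without running the spider, here is a minimal sketch of the _parse_dates flow. It is not part of the commit: the heading text and year below are hypothetical stand-ins for one accordion item's h3 text and the year scraped from the page heading, with the spider's class-level start_time and end_time inlined.

    from dateutil.parser import parse as dateparse

    date_str = "July 17"  # hypothetical h3 text from one accordion item
    year = "2024"         # hypothetical year from the page heading

    # Same string assembly as _parse_dates, with start_time/end_time inlined
    start = dateparse(f"{date_str}, {year} 9:00 am")
    end = dateparse(f"{date_str}, {year} 10:30 am")

    print(start)  # 2024-07-17 09:00:00
    print(end)    # 2024-07-17 10:30:00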
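The link extraction can likewise be exercised offline by feeding a scrap of the page markup to scrapy.Selector. Again a sketch, not from the commit; the HTML snippet and PDF path are made up to match the selectors _parse_links uses:

    import re
    from urllib.parse import urljoin

    from scrapy import Selector

    # Hypothetical accordion item mimicking chicago.gov's markup
    html = """
    <div class="accordion-item card section">
      <h3>July 17</h3>
      <ul class="starlist document">
        <li><a href="/content/dam/cdph/agenda-july.pdf">Agenda   (PDF)</a></li>
      </ul>
    </div>
    """

    base_url = "https://www.chicago.gov"
    item = Selector(text=html).css(".accordion-item.card.section")[0]
    for link in item.css(".starlist.document li a"):
        text = link.css("::text").extract_first()
        url = link.css("::attr(href)").extract_first()
        # Collapse whitespace runs in the link text, as _parse_links does
        print({"title": re.sub(r"\s+", " ", text), "href": urljoin(base_url, url)})
    # -> {'title': 'Agenda (PDF)', 'href': 'https://www.chicago.gov/content/dam/cdph/agenda-july.pdf'}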

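To exercise the fix end to end, assuming the repo's standard Scrapy setup (and Scrapy 2.0+ for the -O flag), running `scrapy crawl chi_pubhealth -O meetings.json` from the project root should fetch the supp_info page, follow each year's "board of health meetings" link, and write the scraped Meeting items to meetings.json.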