Merge pull request #1129 from City-Bureau/fix-ssa

🕷️ Fix spider: Chicago Department of Public Health
City-Bureau · Jul 24, 2024 · 3f47ca3 · 3f47ca3
2 parents 6f054cf + c0b31d7
commit 3f47ca3
Show file tree

Hide file tree

Showing 4 changed files with 3,905 additions and 817 deletions.
diff --git a/city_scrapers/spiders/chi_pubhealth.py b/city_scrapers/spiders/chi_pubhealth.py
@@ -1,141 +1,85 @@
 import re
-from datetime import datetime
+from urllib.parse import urljoin, urlparse
 
 from city_scrapers_core.constants import BOARD
 from city_scrapers_core.items import Meeting
 from city_scrapers_core.spiders import CityScrapersSpider
+from dateutil.parser import parse as dateparse
 
 
 class ChiPubHealthSpider(CityScrapersSpider):
     name = "chi_pubhealth"
     agency = "Chicago Department of Public Health"
     timezone = "America/Chicago"
+    start_urls = ["https://www.chicago.gov/city/en/depts/cdph/supp_info.html"]
+    start_time = "9:00 am"
+    end_time = "10:30 am"
+    location = {
+        "name": "DePaul Center",
+        "address": "333 S State St, 2nd Floor, Room 200",
+    }
 
-    @property
-    def start_urls(self):
+    def parse(self, response):
         """
-        DPH generally uses a standard URL format, but sometimes deviates from
-        the pattern. This property inserts the current year into the standard
-        format, as well as known variants, in hopes DPH sticks to one of their
-        conventions and this scraper does not need to be updated annually.
+        Loop over each link to meeting pages for each year
+        and then parse the meeting page.
         """
-        standard_url = "https://www.chicago.gov/city/en/depts/cdph/supp_info/boh/{}-board-of-health-meetings.html"  # noqa
-        url_variant_1 = "https://www.chicago.gov/city/en/depts/cdph/supp_info/boh/{}-board-of-health.html"  # noqa
-
-        # current_year = 2021
-        current_year = datetime.now().year
-
-        return [
-            standard_url.format(current_year),
-            url_variant_1.format(current_year),
-        ]
+        for link in response.css(".list-group-item.list-group-item-action a"):
+            text = link.css("::text").extract_first()
+            if "board of health meetings" in text.lower():
+                url = link.css("::attr(href)").extract_first()
+                yield response.follow(url, callback=self._parse_year_page)
 
-    def parse(self, response):
+    def _parse_year_page(self, response):
         """
-        `parse` should always `yield` Meeting items.
-
-        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
-        needs.
+        Parse the year page and return the list of meetings,
+        which appear to be monthly.
         """
-        title = response.xpath('//h1[@class="page-heading"]/text()').extract_first()
-
-        # Extract year and meeting name from title like "2017 Board of Health Meetings"
-        year_match = re.match(r"\d{4}", title)
-        self.year = int(year_match.group())
-
-        # The description and meeting dates are a series of p elements
-        for idx, item in enumerate(response.css(".page-description-above p"), start=1):
-            if idx == 1:
-                # inspect_response(response, self)
-                # Description is the first p element
-                description = item.xpath("text()").getall()
-                # description = item.xpath("text()").extract_first()
-                if "333 S" not in description[1]:
-                    raise ValueError(description)
-                continue
-
-            # Skip empty rows
-            if not item.css("*::text").extract_first().strip():
+        page_title = response.css("h1.page-heading::text").extract_first()
+        year = re.search(r"\d{4}", page_title).group()
+        base_url = self._get_base_url(response)
+        for item in response.css(".accordion-item.card.section"):
+            links = self._parse_links(item, base_url)
+            # if there are no links then assume that month's meeting
+            # has been cancelled
+            if not links:
                 continue
-
-            start = self._parse_start(item)
-            if start is None:
-                continue
-
+            start, end = self._parse_dates(item, year)
             meeting = Meeting(
-                title="Board of Health",
+                title="Board of Health meeting",
                 description="",
                 classification=BOARD,
                 start=start,
-                end=None,
-                time_notes="",
+                end=end,
                 all_day=False,
-                location={
-                    "name": "2nd Floor Board Room, DePaul Center",
-                    "address": "333 S State St, Chicago, IL 60604",
-                },
-                links=self._parse_links(item, response),
+                time_notes="",
+                location=self.location,
+                links=links,
                 source=response.url,
             )
+            meeting["status"] = self._get_status(meeting)
             meeting["id"] = self._get_id(meeting)
-            meeting["status"] = self._get_status(
-                meeting, text=" ".join(item.css("*::text").extract())
-            )
             yield meeting
 
-    def _parse_date(self, item):
-        """
-        Parse the meeting date.
-        """
-        # Future meetings are plain text
-        date_text = (item.xpath("text()").extract_first() or "").strip()
-
-        if not date_text:
-            # Past meetings are links to the agenda
-            date_text = item.xpath("a/text()").extract_first()
-        if date_text is None:
-            return None
-        # Remove extra whitespace characters
-        date_text = re.sub(r"\s+", " ", str(date_text)).strip()
-
-        # Handle typos like "December18"
-        if re.match(r"[a-zA-Z]+\d+", str(date_text)):
-            date_match = re.search(r"(?P<month>[a-zA-Z]+)(?P<day>\d+)", date_text)
-            date_text = "{} {}".format(
-                date_match.group("month"), date_match.group("day")
-            )
-        # Extract date formatted like "January 12"
-
-        return datetime.strptime(date_text, "%B %d")
-
-    def _parse_start(self, item):
-        """
-        Parse the meeting date and set start time to 9am.
-        """
-        datetime_obj = self._parse_date(item)
-
-        if datetime_obj is None:
-            return None
-
-        return datetime(self.year, datetime_obj.month, datetime_obj.day, 9)
-
-    def _parse_links(self, item, response):
-        """
-        Parse agenda and minutes, if available.
-        """
-        documents = []
-
-        agenda_relative_url = item.xpath("a/@href").extract_first()
-        if agenda_relative_url:
-            documents.append(
-                {"href": response.urljoin(agenda_relative_url), "title": "Agenda"}
-            )
-
-        minutes_relative_url = item.xpath(
-            "following-sibling::ul/li/a/@href"
-        ).extract_first()
-        if minutes_relative_url:
-            documents.append(
-                {"href": response.urljoin(minutes_relative_url), "title": "Minutes"}
-            )
-        return documents
+    def _parse_dates(self, item, year):
+        """Parse the date range from the text"""
+        date_str = item.css("h3::text").extract_first()
+        start_str = f"{date_str}, {year} {self.start_time}"
+        end_str = f"{date_str}, {year} {self.end_time}"
+        start = dateparse(start_str)
+        end = dateparse(end_str)
+        return start, end
+
+    def _get_base_url(self, response):
+        return f"{urlparse(response.url).scheme}://{urlparse(response.url).netloc}"
+
+    def _parse_links(self, item, base_url):
+        result = []
+        links = item.css(".starlist.document li a")
+        for link in links:
+            text = link.css("::text").extract_first()
+            clean_text = re.sub(r"\s+", " ", text)
+            url = link.css("::attr(href)").extract_first()
+            url_abs = urljoin(base_url, url)
+            result.append({"title": clean_text, "href": url_abs})
+        return result