Fix spider: il_pollution_control
SimmonsRitchie committed May 8, 2024
1 parent 3791f27 commit 52af1a4
Showing 3 changed files with 106 additions and 228 deletions.
city_scrapers/spiders/il_pollution_control.py (248 changes: 69 additions, 179 deletions)
@@ -1,214 +1,104 @@
 import json
 import re
 from datetime import datetime
-from io import BytesIO, StringIO
 
 import scrapy
 from city_scrapers_core.constants import BOARD, NOT_CLASSIFIED
 from city_scrapers_core.items import Meeting
 from city_scrapers_core.spiders import CityScrapersSpider
-from pdfminer.high_level import extract_text_to_fp
-from pdfminer.layout import LAParams
 
 
 class IlPollutionControlSpider(CityScrapersSpider):
     name = "il_pollution_control"
     agency = "Illinois Pollution Control Board"
     timezone = "America/Chicago"
+    domain = "https://pcb.illinois.gov"
     start_urls = [
-        "https://pcb.illinois.gov/ClerksOffice/MeetingMinutes",
-        "https://pcb.illinois.gov/CurrentAgendas",
+        domain + "/ClerksOffice/GetCalendarEvents",
     ]
+    calendar_page = "https://pcb.illinois.gov/ClerksOffice/Calendar"
+    default_links = [
+        {
+            "title": "Agendas",
+            "href": "https://pcb.illinois.gov/CurrentAgendas",
+        },
+        {
+            "title": "Meeting minutes",
+            "href": "https://pcb.illinois.gov/ClerksOffice/MeetingMinutes",
+        },
+    ]
-    json_url = "https://pcb.illinois.gov/ClerksOffice/GetCalendarEvents"
-    calendar_url = "https://pcb.illinois.gov/ClerksOffice/Calendar"
 
-    def __init__(self, *args, **kwargs):
-        self.minutes_map = dict()  # Populated by self._parse_minutes()
-        self.agenda_map = dict()  # Populated by self._parse_agenda()
-        self.relevant_years = [
-            str(y) for y in range(datetime.now().year - 1, datetime.now().year + 1)
-        ]
-        super().__init__(*args, **kwargs)
-
-    @classmethod
-    def from_crawler(cls, crawler, *args, **kwargs):
-        """Overridden `from_crawler` to connect `spider_idle` signal."""
-        spider = super().from_crawler(crawler, *args, **kwargs)
-        crawler.signals.connect(spider.spider_idle, signal=scrapy.signals.spider_idle)
-        return spider
-
-    def spider_idle(self):
-        """
-        React to `spider_idle` signal by starting JSON parsing after _parse_minutes.
-        """
-        self.crawler.signals.disconnect(
-            self.spider_idle, signal=scrapy.signals.spider_idle
-        )
-        self.crawler.engine.crawl(
-            scrapy.Request(self.json_url, callback=self._parse_json), self
-        )
-        raise scrapy.exceptions.DontCloseSpider
 
     def parse(self, response):
-        """
-        `parse` should always `yield` Meeting items.
-        """
-        # Gather and store links to meeting minutes:
-        for item in response.xpath("//iframe/@src"):
-            yield scrapy.Request(item.get(), callback=self._parse_minutes)
-
-        # Gather and store link to agenda:
-        for agenda_url in self._parse_agenda_page(response):
-            yield scrapy.Request(agenda_url, callback=self._parse_agenda)
-
-    def _parse_minutes(self, response):
-        """Traverse tree of URLs and populate self.minutes_map"""
-        for item in response.xpath("//td[@class='name']/a"):
-            try:
-                href = item.xpath("@href")[0].get()
-                text = item.xpath("b/text()")[0].get().strip()
-                if not any([(year in text) for year in self.relevant_years]):
-                    continue  # Link does not contain documents from recent years
-                if text[-4:] == ".pdf":
-                    text = text[:-4]
-            except IndexError:
-                continue
-
-            url = response.urljoin(href)
-            if ".pdf" not in url:
-                # Not a link to meeting minutes file - go a level deeper
-                yield scrapy.Request(url, callback=self._parse_minutes)
-            else:
-                # Dates are given in several formats:
-                format_strs = ["%m-%d-%Y", "%m-%d-%y", "%m/%d/%Y", "%m/%d/%y"]
-                dt = None
-                for format_str in format_strs:
-                    try:
-                        dt = datetime.strptime(text, format_str).date()
-                    except ValueError:
-                        continue
-                    else:
-                        break  # Found a format_str that matches - stop looking
-                if dt is None:
-                    continue  # Could not find matching format_str - can't process link.
-
-                self.minutes_map[dt] = url
-
-    def _parse_agenda_page(self, response):
-        """Scrape link to agenda PDF"""
-        for item in response.xpath("//div/div/a"):
-            for _ in item.xpath(".//div/h5[text()='Board Meeting']"):
-                for href in item.xpath("./@href"):
-                    yield href.get()
-
-    def _parse_agenda(self, response):
-        """Parse PDF with agenda for date and store link + date"""
-        # pdf_obj = PdfFileReader(BytesIO(response.body))
-        # pdf_text = pdf_obj.getPage(0).extractText().replace("\n", "")
-        lp = LAParams(line_margin=0.1)
-        out_str = StringIO()
-        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
-        pdf_text = out_str.getvalue().replace("\n", "")
-
-        # Find and extract strings for month/day/year:
-        regex = re.compile(r"(?P<month>[a-zA-Z]+) (?P<day>[0-9]+), (?P<year>[0-9]{4})")
-        m = regex.search(pdf_text)
-
-        try:
-            month = datetime.strptime(m.group("month"), "%B").month
-            day = int(m.group("day"))
-            year = int(m.group("year"))
-            self.agenda_map[datetime(year, month, day).date()] = response.url
-        except AttributeError:  # Regex failed to match.
-            return None
-
-        return None
-
-    def _parse_json(self, response):
-        """
-        Parse JSON from /ClerksOffice/GetCalendarEvents -> Meetings
-        """
         data = json.loads(response.text)
 
         for item in data:
-            if any(
-                s in item["CalendarTypeDesc"].lower()
-                for s in ("holiday", "seminar", "hearing")
-            ):
-                continue  # Not interested in this event type
-
-            title = item["CalendarTypeDesc"].replace("CANCELLED", "").strip()
+            title = item.get("CalendarTypeDesc")
+            if not title or "holiday" in title.lower():
+                continue
             meeting = Meeting(
                 title=title,
-                description="",  # Too inconsistent to parse accurately
-                classification=self._parse_classification(title),
-                start=self._parse_start(item),
-                end=None,
-                all_day=item["IsFullDay"],
+                description=self._parse_description(item),
+                classification=self._parse_classification(item),
+                start=self._parse_datetime(item.get("StartDateTime")),
+                end=self._parse_datetime(item.get("EndDateTime")),
+                all_day=item.get("IsFullDay"),
                 time_notes="",
                 location=self._parse_location(item),
-                links=list(),
-                source=self._parse_source(item, response),
-            )
-
-            meeting["links"] = self._parse_links(meeting)
-            meeting["status"] = self._get_status(
-                meeting,
-                text=" ".join([item["CalendarTypeDesc"], item["Description"]]).lower(),
+                links=self._parse_links(item),
+                source=self.calendar_page,
             )
+            meeting["status"] = self._get_status(meeting, text=item.get("Cancelled"))
             meeting["id"] = self._get_id(meeting)
 
             yield meeting
 
-    def _parse_classification(self, title):
-        """Parse or generate classification from allowed options."""
-        if "Board" in title:
-            return BOARD
-        else:
-            return NOT_CLASSIFIED
+    def _parse_datetime(self, date_str):
+        """Parse the datetime from the string format in the JSON"""
+        if date_str:
+            return datetime.strptime(date_str, "%m/%d/%Y %I:%M:%S %p")
+        return None
 
-    def _parse_start(self, item):
-        return datetime.strptime(item["StartDateTime"], "%m/%d/%Y %I:%M:%S %p")
+    def _parse_description(self, item):
+        """
+        Extract and clean text from HTML description using Scrapy selectors,
+        removing hidden characters and non-standard whitespace.
+        """
+        description_html = item.get("Description", "")
+        selector = scrapy.Selector(text=description_html)
+        text_lines = selector.xpath("//text()").extract()
+        clean_text = " ".join(line.strip() for line in text_lines if line.strip())
+        # Using regex to remove non-printable characters and other unwanted symbols
+        clean_description = re.sub(r"[\x00-\x1f\x7f-\x9f]", "", clean_text)
+        return clean_description
+
+    def _parse_classification(self, item):
+        if "Board" in item.get("CalendarTypeDesc", ""):
+            return BOARD
+        return NOT_CLASSIFIED
 
     def _parse_location(self, item):
-        """Parse or generate location."""
-        text = " ".join([item["Description"], item["Location"]]).lower()
-        if "thompson" in text:
+        if item.get("Location"):
             return {
-                "address": "James R. Thompson Center - 100 W. Randolph St. Suite 11-500, Chicago, IL 60601",  # noqa
-                "name": "Chicago IPCB Office",
-            }
-        elif "springfield" in text or "llinois pollution control board" in text:
-            return {
-                "address": "1021 N. Grand Ave. E. - Room 1244 N, Springfield, IL 62702",
-                "name": "Springfield IPCB Office",
-            }
-        elif "sangamo room" in text:
-            return {
-                "address": "1021 N. Grand Ave. E. - Sangamo Room, Springfield, IL 62702",  # noqa
-                "name": "Illinois EPA",
-            }
-        else:
-            return {
-                "address": "",
                 "name": "",
+                "address": item["Location"].strip(),
             }
-
-    def _parse_links(self, meeting):
-        """Associate Meeting objects with previously-scraped links"""
-        links = list()
-        key = meeting["start"].date()
-        if key in self.minutes_map:
-            links.append({"href": self.minutes_map[key], "title": "Minutes"})
-        if key in self.agenda_map:
-            links.append({"href": self.agenda_map[key], "title": "Agenda"})
-
-        return links
-
-    def _parse_source(self, item, response):
-        """Parse or generate source."""
-        rel_url = scrapy.Selector(text=item["Description"]).xpath(".//a/@href").get()
-        if rel_url:
-            return response.urljoin(rel_url)
-        else:
-            return self.calendar_url
+        return {"name": "No location provided", "address": ""}
 
+    def _parse_links(self, item):
+        """Parse links from description."""
+        description_html = item.get("Description")
+        selector = scrapy.Selector(text=description_html)
+        a_tags = selector.css("a")
+        links = []
+        for a_tag in a_tags:
+            # check if href is relative or absolute and prefix domain if needed
+            href = a_tag.attrib.get("href")
+            href_clean = href if href.startswith("http") else self.domain + href
+            title = a_tag.attrib.get("title")
+            clean_title = title if title else "Related document"
+            link = {
+                "href": href_clean,
+                "title": clean_title,
+            }
+            links.append(link)
+        final_links = self.default_links + links
+        return final_links
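For context, the rewritten parse() assumes calendar items shaped roughly like the sketch below. The field names (CalendarTypeDesc, StartDateTime, EndDateTime, IsFullDay, Location, Description, Cancelled) are the ones the spider reads; the values and the document URL are invented for illustration, not captured API output.

from datetime import datetime

import scrapy

# Hypothetical GetCalendarEvents item; values are illustrative only
item = {
    "CalendarTypeDesc": "Board Meeting",
    "StartDateTime": "05/16/2024 11:00:00 AM",
    "EndDateTime": "05/16/2024 01:00:00 PM",
    "IsFullDay": False,
    "Location": "James R. Thompson Center, Chicago",
    "Description": '<p>See the <a href="/documents/agenda.pdf">agenda</a>.</p>',
    "Cancelled": None,
}

# Mirrors _parse_datetime: the API serializes timestamps as "%m/%d/%Y %I:%M:%S %p"
start = datetime.strptime(item["StartDateTime"], "%m/%d/%Y %I:%M:%S %p")
assert start == datetime(2024, 5, 16, 11, 0)

# Mirrors _parse_description: strip tags with a Selector, then squeeze whitespace
text_lines = scrapy.Selector(text=item["Description"]).xpath("//text()").extract()
description = " ".join(line.strip() for line in text_lines if line.strip())

# Mirrors _parse_links: relative hrefs get the spider's domain prefixed
href = scrapy.Selector(text=item["Description"]).css("a").attrib.get("href")
full_href = href if href.startswith("http") else "https://pcb.illinois.gov" + href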
tests/files/il_pollution_control.json (2 changes: 1 addition, 1 deletion)
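The refreshed fixture plugs into the usual city-scrapers test pattern. A minimal sketch follows, assuming the standard file_response helper from city_scrapers_core; the commit's actual test file and its assertions are not shown above, so the count check here is a placeholder.

from os.path import dirname, join

from city_scrapers_core.utils import file_response

from city_scrapers.spiders.il_pollution_control import IlPollutionControlSpider


def test_parse():
    # Replay the stored JSON fixture as if it came from the calendar endpoint
    response = file_response(
        join(dirname(__file__), "files", "il_pollution_control.json"),
        url="https://pcb.illinois.gov/ClerksOffice/GetCalendarEvents",
    )
    spider = IlPollutionControlSpider()
    items = list(spider.parse(response))
    # Exact counts depend on the fixture's contents
    assert len(items) > 0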

Large diffs are not rendered by default.

