From cb2e83fe68ac9f44619672796f9f484d3ef70d7c Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Tue, 30 Apr 2024 14:54:46 -0500 Subject: [PATCH] Deprecate spider chi_license_appeal --- city_scrapers/spiders/chi_license_appeal.py | 93 -- tests/files/chi_license_appeal.html | 1656 ------------------- tests/test_chi_license_appeal.py | 80 - 3 files changed, 1829 deletions(-) delete mode 100644 city_scrapers/spiders/chi_license_appeal.py delete mode 100644 tests/files/chi_license_appeal.html delete mode 100644 tests/test_chi_license_appeal.py diff --git a/city_scrapers/spiders/chi_license_appeal.py b/city_scrapers/spiders/chi_license_appeal.py deleted file mode 100644 index f9b27cc47..000000000 --- a/city_scrapers/spiders/chi_license_appeal.py +++ /dev/null @@ -1,93 +0,0 @@ -import re -from datetime import datetime - -from city_scrapers_core.constants import COMMISSION -from city_scrapers_core.items import Meeting -from city_scrapers_core.spiders import CityScrapersSpider - - -class ChiLicenseAppealSpider(CityScrapersSpider): - name = "chi_license_appeal" - agency = "Chicago License Appeal Commission" - timezone = "America/Chicago" - start_urls = ["https://www.chicago.gov/city/en/depts/lac/supp_info.html"] - location = { - "name": "Richard J Daley Center", - "address": "50 W Washington St, LL 02 Chicago, IL 60602", - } - - def parse(self, response): - """Get all meeting schedule links (since years in URL aren't reliable)""" - for link in response.css(".page-center .list-supporting-info a")[:10]: - link_text = " ".join(link.css("*::text").extract()) - if "schedule" in link_text.lower(): - yield response.follow( - link.attrib["href"], callback=self._parse_meetings, dont_filter=True - ) - - def _parse_meetings(self, response): - """ - `_parse_meetings` should always `yield` Meeting items. - - Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping - needs. - """ - response_text = " ".join( - response.css(".page-full-description *::text").extract() - ) - self._validate_location(response_text) - year_str = re.search( - r"\d{4}", response.css("h1.page-heading::text").extract_first() - ).group() - time_str = ( - re.search(r"\d{1,2}:\d{2} [apm\.]{2,4}", response_text) - .group() - .replace(".", "") - ) - - for item in response.css(".page-full-description li"): - item_text = " ".join(item.css("*::text").extract()) - meeting = Meeting( - title="License Appeal Commission", - description="", - classification=COMMISSION, - start=self._parse_start(item_text, time_str, year_str), - end=None, - all_day=False, - time_notes="", - location=self.location, - links=self._parse_links(item, response), - source=response.url, - ) - - meeting["status"] = self._get_status(meeting, text=item_text) - meeting["id"] = self._get_id(meeting) - - yield meeting - - def _parse_start(self, item_text, time_str, year_str): - """Parse start datetime as a naive datetime object.""" - date_str = re.sub( - r"\s+", " ", re.search(r"[a-zA-Z]{3,10}\s+\d{1,2}", item_text).group() - ) - return datetime.strptime(date_str + year_str + time_str, "%B %d%Y%I:%M %p") - - def _validate_location(self, response_text): - """Check that location hasn't changed""" - if "50 W" not in response_text: - raise ValueError("Meeting location has changed") - - def _parse_links(self, item, response): - """ - Parse or generate links. No links are currently present, but it's similar to - other pages where they are sometimes posted inside the li tag. - """ - links = [] - for link in item.css("a"): - links.append( - { - "title": " ".join(link.css("*::text").extract()).strip(), - "href": response.urljoin(link.attrib["href"]), - } - ) - return links diff --git a/tests/files/chi_license_appeal.html b/tests/files/chi_license_appeal.html deleted file mode 100644 index 6b969d5ce..000000000 --- a/tests/files/chi_license_appeal.html +++ /dev/null @@ -1,1656 +0,0 @@ - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - --
Status call starts at 11:00 am
-