From cb2e83fe68ac9f44619672796f9f484d3ef70d7c Mon Sep 17 00:00:00 2001
From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com>
Date: Tue, 30 Apr 2024 14:54:46 -0500
Subject: [PATCH] Deprecate spider chi_license_appeal

---
 city_scrapers/spiders/chi_license_appeal.py |   93 --
 tests/files/chi_license_appeal.html         | 1656 -------------------
 tests/test_chi_license_appeal.py            |   80 -
 3 files changed, 1829 deletions(-)
 delete mode 100644 city_scrapers/spiders/chi_license_appeal.py
 delete mode 100644 tests/files/chi_license_appeal.html
 delete mode 100644 tests/test_chi_license_appeal.py

diff --git a/city_scrapers/spiders/chi_license_appeal.py b/city_scrapers/spiders/chi_license_appeal.py
deleted file mode 100644
index f9b27cc47..000000000
--- a/city_scrapers/spiders/chi_license_appeal.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import re
-from datetime import datetime
-
-from city_scrapers_core.constants import COMMISSION
-from city_scrapers_core.items import Meeting
-from city_scrapers_core.spiders import CityScrapersSpider
-
-
-class ChiLicenseAppealSpider(CityScrapersSpider):
-    name = "chi_license_appeal"
-    agency = "Chicago License Appeal Commission"
-    timezone = "America/Chicago"
-    start_urls = ["https://www.chicago.gov/city/en/depts/lac/supp_info.html"]
-    location = {
-        "name": "Richard J Daley Center",
-        "address": "50 W Washington St, LL 02 Chicago, IL 60602",
-    }
-
-    def parse(self, response):
-        """Get all meeting schedule links (since years in URL aren't reliable)"""
-        for link in response.css(".page-center .list-supporting-info a")[:10]:
-            link_text = " ".join(link.css("*::text").extract())
-            if "schedule" in link_text.lower():
-                yield response.follow(
-                    link.attrib["href"], callback=self._parse_meetings, dont_filter=True
-                )
-
-    def _parse_meetings(self, response):
-        """
-        `_parse_meetings` should always `yield` Meeting items.
-
-        Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
-        needs.
-        """
-        response_text = " ".join(
-            response.css(".page-full-description *::text").extract()
-        )
-        self._validate_location(response_text)
-        year_str = re.search(
-            r"\d{4}", response.css("h1.page-heading::text").extract_first()
-        ).group()
-        time_str = (
-            re.search(r"\d{1,2}:\d{2} [apm\.]{2,4}", response_text)
-            .group()
-            .replace(".", "")
-        )
-
-        for item in response.css(".page-full-description li"):
-            item_text = " ".join(item.css("*::text").extract())
-            meeting = Meeting(
-                title="License Appeal Commission",
-                description="",
-                classification=COMMISSION,
-                start=self._parse_start(item_text, time_str, year_str),
-                end=None,
-                all_day=False,
-                time_notes="",
-                location=self.location,
-                links=self._parse_links(item, response),
-                source=response.url,
-            )
-
-            meeting["status"] = self._get_status(meeting, text=item_text)
-            meeting["id"] = self._get_id(meeting)
-
-            yield meeting
-
-    def _parse_start(self, item_text, time_str, year_str):
-        """Parse start datetime as a naive datetime object."""
-        date_str = re.sub(
-            r"\s+", " ", re.search(r"[a-zA-Z]{3,10}\s+\d{1,2}", item_text).group()
-        )
-        return datetime.strptime(date_str + year_str + time_str, "%B %d%Y%I:%M %p")
-
-    def _validate_location(self, response_text):
-        """Check that location hasn't changed"""
-        if "50 W" not in response_text:
-            raise ValueError("Meeting location has changed")
-
-    def _parse_links(self, item, response):
-        """
-        Parse or generate links. No links are currently present, but it's similar to
-        other pages where they are sometimes posted inside the li tag.
-        """
-        links = []
-        for link in item.css("a"):
-            links.append(
-                {
-                    "title": " ".join(link.css("*::text").extract()).strip(),
-                    "href": response.urljoin(link.attrib["href"]),
-                }
-            )
-        return links
diff --git a/tests/files/chi_license_appeal.html b/tests/files/chi_license_appeal.html
deleted file mode 100644
index 6b969d5ce..000000000
--- a/tests/files/chi_license_appeal.html
+++ /dev/null
@@ -1,1656 +0,0 @@
-City of Chicago :: 2019 Meeting Schedule
-[... chicago.gov page markup and navigation omitted; the meeting schedule text from the fixture follows ...]
-2019 Meeting Schedule
-  •  January 16
-  •  February 6
-  •  March 13
-  •  April 10
-  •  May 15
-  •  June 12
-  •  July 17
-  •  August 14
-  •  September 18
-  •  October 23
-  •  November 20
-  •  December 18
-Status call starts at 11:00 am
-Richard J. Daley Center
-50 W. Washington Street, LL 02 (Lower Level)
\ No newline at end of file
diff --git a/tests/test_chi_license_appeal.py b/tests/test_chi_license_appeal.py
deleted file mode 100644
index a68fd5011..000000000
--- a/tests/test_chi_license_appeal.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from datetime import datetime
-from os.path import dirname, join
-
-import pytest  # noqa
-from city_scrapers_core.constants import COMMISSION, PASSED
-from city_scrapers_core.utils import file_response
-from freezegun import freeze_time
-
-from city_scrapers.spiders.chi_license_appeal import ChiLicenseAppealSpider
-
-test_response = file_response(
-    join(dirname(__file__), "files", "chi_license_appeal.html"),
-    url="https://www.chicago.gov/city/en/depts/lac/supp_info/2009hearings.html",
-)
-spider = ChiLicenseAppealSpider()
-
-freezer = freeze_time("2019-09-18")
-freezer.start()
-
-parsed_items = [item for item in spider._parse_meetings(test_response)]
-
-freezer.stop()
-
-
-def test_count():
-    assert len(parsed_items) == 12
-
-
-def test_title():
-    assert parsed_items[0]["title"] == "License Appeal Commission"
-
-
-def test_description():
-    assert parsed_items[0]["description"] == ""
-
-
-def test_start():
-    assert parsed_items[0]["start"] == datetime(2019, 1, 16, 11, 0)
-
-
-def test_end():
-    assert parsed_items[0]["end"] is None
-
-
-def test_time_notes():
-    assert parsed_items[0]["time_notes"] == ""
-
-
-def test_id():
-    assert (
-        parsed_items[0]["id"]
-        == "chi_license_appeal/201901161100/x/license_appeal_commission"
-    )
-
-
-def test_status():
-    assert parsed_items[0]["status"] == PASSED
-
-
-def test_location():
-    assert parsed_items[0]["location"] == spider.location
-
-
-def test_source():
-    assert (
-        parsed_items[0]["source"]
-        == "https://www.chicago.gov/city/en/depts/lac/supp_info/2009hearings.html"
-    )
-
-
-def test_links():
-    assert parsed_items[0]["links"] == []
-
-
-def test_classification():
-    assert parsed_items[0]["classification"] == COMMISSION
-
-
-def test_all_day():
-    assert parsed_items[0]["all_day"] is False