From 6e5b39664edc80bde8119b523165a035dccc793d Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:52:33 -0500 Subject: [PATCH] Deprecate spider chi_ssa_52 --- city_scrapers/spiders/chi_ssa_52.py | 142 ------- tests/files/chi_ssa_52.html | 609 ---------------------------- tests/test_chi_ssa_52.py | 74 ---- 3 files changed, 825 deletions(-) delete mode 100644 city_scrapers/spiders/chi_ssa_52.py delete mode 100644 tests/files/chi_ssa_52.html delete mode 100644 tests/test_chi_ssa_52.py diff --git a/city_scrapers/spiders/chi_ssa_52.py b/city_scrapers/spiders/chi_ssa_52.py deleted file mode 100644 index 6e9454fef..000000000 --- a/city_scrapers/spiders/chi_ssa_52.py +++ /dev/null @@ -1,142 +0,0 @@ -import re -from datetime import datetime -from difflib import SequenceMatcher - -from city_scrapers_core.constants import COMMISSION -from city_scrapers_core.items import Meeting -from city_scrapers_core.spiders import CityScrapersSpider - - -class ChiSsa52Spider(CityScrapersSpider): - name = "chi_ssa_52" - agency = "Chicago Special Service Area #52 51st Street" - timezone = "America/Chicago" - start_urls = ["https://www.51ststreetchicago.com/about.html"] - - def parse(self, response): - """ - `parse` should always `yield` Meeting items. - - Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping - needs. - """ - - items = response.css("div.paragraph")[3:4] - title = items.css("strong::text").get() - meeting = items.css("ul")[0] - - item = (title, meeting) - - for meet in meeting.css("li"): - meet = self._clean_meet(meet) - - meeting = Meeting( - title=self._parse_title(title), - description=self._parse_description(item), - classification=self._parse_classification(item), - start=self._parse_start(meet), - end=self._parse_end(item), - all_day=self._parse_all_day(item), - time_notes=self._parse_time_notes(item), - location=self._parse_location(item), - links=self._parse_links(item), - source=self._parse_source(response), - ) - - meeting["status"] = self._get_status(meeting) - meeting["id"] = self._get_id(meeting) - - yield meeting - - def _clean_meet(self, meet): - """Clean a meet datetime info and group the info""" - meet = meet.css("::text").get() - meet = meet.replace("\xa0", "") - - clean_str = re.sub(r"[^\w:]+", " ", meet) - meet_info = clean_str.split() - - return meet_info - - def _check_am_pm(self, time): - time = time.split(":") - hour = time[0] - minutes = time[1] - - if int(hour) >= 8 and int(hour) <= 12: - return f"{hour}:{minutes} AM" - return f"{hour}:{minutes} PM" - - def _parse_title(self, item): - """Parse or generate meeting title.""" - return "Commission" - - def _parse_description(self, item): - """Parse or generate meeting description.""" - return "" - - def _parse_classification(self, item): - """Parse or generate classification from allowed options.""" - return COMMISSION - - def _parse_start(self, item): - """Parse start datetime as a naive datetime object.""" - months = [ - "JANUARY", - "FEBRUARY", - "MARCH", - "APRIL", - "MAY", - "JUNE", - "JULY", - "AUGUST", - "SEPTEMBER", - "OCTOBER", - "NOVEMBER", - "DECEMBER", - ] - - time = item[4] - time = self._check_am_pm(time) - - try: - date = datetime.strptime( - f"{item[2]} {item[1]} {item[3]} {time}", - "%d %B %Y %I:%M %p", - ) - except ValueError: - for month in months: - ratio = SequenceMatcher(None, month, item[1]).ratio() - if ratio > 0.5: - date = datetime.strptime( - f"{item[2]} {month} {item[3]} {time}", - "%d %B %Y %I:%M %p", - ) - return date - - def _parse_end(self, item): - """Parse end datetime as a naive datetime object. Added by pipeline if None""" - return None - - def _parse_time_notes(self, item): - """Parse any additional notes on the timing of the meeting""" - return "" - - def _parse_all_day(self, item): - """Parse or generate all-day status. Defaults to False.""" - return False - - def _parse_location(self, item): - """Parse or generate location.""" - return { - "address": "220 E 51st St Chicago, IL 60615", - "name": "51st Street Business Association", - } - - def _parse_links(self, item): - """Parse or generate links.""" - return [] - - def _parse_source(self, response): - """Parse or generate source.""" - return response.url diff --git a/tests/files/chi_ssa_52.html b/tests/files/chi_ssa_52.html deleted file mode 100644 index abc06bffc..000000000 --- a/tests/files/chi_ssa_52.html +++ /dev/null @@ -1,609 +0,0 @@ - - - - About - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/tests/test_chi_ssa_52.py b/tests/test_chi_ssa_52.py deleted file mode 100644 index bd6904cff..000000000 --- a/tests/test_chi_ssa_52.py +++ /dev/null @@ -1,74 +0,0 @@ -from datetime import datetime -from os.path import dirname, join - -import pytest -from city_scrapers_core.constants import COMMISSION -from city_scrapers_core.utils import file_response -from freezegun import freeze_time - -from city_scrapers.spiders.chi_ssa_52 import ChiSsa52Spider - -test_response = file_response( - join(dirname(__file__), "files", "chi_ssa_52.html"), - url="https://www.51ststreetchicago.com/about.html", -) -spider = ChiSsa52Spider() - -freezer = freeze_time("2020-09-14") -freezer.start() - -parsed_items = [item for item in spider.parse(test_response)] - -freezer.stop() - - -def test_title(): - assert parsed_items[0]["title"] == "Commission" - - -def test_description(): - assert parsed_items[0]["description"] == "" - - -def test_start(): - assert parsed_items[0]["start"] == datetime(2020, 2, 18, 13, 30) - - -# def test_end(): -# assert parsed_items[0]["end"] == datetime(2019, 1, 1, 0, 0) - - -def test_time_notes(): - assert parsed_items[0]["time_notes"] == "" - - -def test_id(): - assert parsed_items[0]["id"] == "chi_ssa_52/202002181330/x/commission" - - -def test_status(): - assert parsed_items[0]["status"] == "passed" - - -def test_location(): - assert parsed_items[0]["location"] == { - "name": "51st Street Business Association", - "address": "220 E 51st St Chicago, IL 60615", - } - - -def test_source(): - assert parsed_items[0]["source"] == "https://www.51ststreetchicago.com/about.html" - - -def test_links(): - assert parsed_items[0]["links"] == [] - - -def test_classification(): - assert parsed_items[0]["classification"] == COMMISSION - - -@pytest.mark.parametrize("item", parsed_items) -def test_all_day(item): - assert item["all_day"] is False