From 9e34262586d7ca027bcf8fabea02248099b0694f Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Wed, 1 May 2024 14:59:59 -0500 Subject: [PATCH] Deprecate spider chi_ssa_25 --- city_scrapers/spiders/chi_ssa_25.py | 100 --- tests/files/chi_ssa_25.html | 1235 --------------------------- tests/test_chi_ssa_25.py | 66 -- 3 files changed, 1401 deletions(-) delete mode 100644 city_scrapers/spiders/chi_ssa_25.py delete mode 100644 tests/files/chi_ssa_25.html delete mode 100644 tests/test_chi_ssa_25.py diff --git a/city_scrapers/spiders/chi_ssa_25.py b/city_scrapers/spiders/chi_ssa_25.py deleted file mode 100644 index 56e1f3600..000000000 --- a/city_scrapers/spiders/chi_ssa_25.py +++ /dev/null @@ -1,100 +0,0 @@ -import html -import json -import re -from datetime import datetime - -from city_scrapers_core.constants import COMMISSION -from city_scrapers_core.items import Meeting -from city_scrapers_core.spiders import CityScrapersSpider -from scrapy import Selector - - -class ChiSsa25Spider(CityScrapersSpider): - name = "chi_ssa_25" - agency = "Chicago Special Service Area #25 Little Village" - timezone = "America/Chicago" - start_urls = ["https://littlevillagechamber.org/ssa-25/meetings-minutes/"] - - def parse(self, response): - """ - `parse` should always `yield` Meeting items. - - Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping - needs. - """ - yield from self._parse_events(response) - # Only parse one previous page of results in addition to the main page - for prev_link in response.css("a.tribe-events-c-nav__prev"): - yield response.follow(prev_link.attrib["href"], callback=self._parse_events) - - def _parse_events(self, response): - for event_link in response.css( - "a.tribe-events-calendar-list__event-title-link" - ): - yield response.follow( - event_link.attrib["href"], callback=self._parse_detail - ) - - def _parse_detail(self, response): - schema_text = response.css( - "[type='application/ld+json']:not(.yoast-schema-graph)::text" - ).extract_first() - if not schema_text: - return - schema_data = json.loads(schema_text)[0] - meeting = Meeting( - title=schema_data["name"], - description=self._parse_description(schema_data), - classification=COMMISSION, - start=self._parse_dt_str(schema_data["startDate"]), - end=self._parse_dt_str(schema_data["endDate"]), - time_notes="", - all_day=False, - location=self._parse_location(schema_data), - links=self._parse_links(response), - source=schema_data["url"], - ) - meeting["status"] = self.get_status(meeting) - meeting["id"] = self.get_id(meeting) - - yield meeting - - def _parse_description(self, item): - desc_sel = Selector(text=html.unescape(item.get("description", ""))) - return re.sub( - r"\s+", " ", " ".join(desc_sel.css("*::text").extract()).replace("\\n", "") - ).strip() - - def _parse_dt_str(self, dt_str): - return datetime.strptime(dt_str[:-6], "%Y-%m-%dT%H:%M:%S") - - def _parse_location(self, item): - location = item["location"] - if "conference call" in location["name"].lower() or "Zoom" in location["name"]: - return { - "name": "Conference Call", - "address": "", - } - loc_addr = location["address"] - addr_str = " ".join( - [ - loc_addr["streetAddress"], - f"{loc_addr.get('addressLocality', 'Chicago')}, {loc_addr.get('addressRegion', 'IL')}", # noqa - loc_addr["postalCode"], - ] - ) - return {"name": location["name"], "address": addr_str} - - def _parse_links(self, response): - """Parse or generate links""" - links = [] - for link in response.css("#primary .fl-row-content-wrap a.uabb-button"): - link_text = " ".join(link.css("*::text").extract()) - if "minutes" in link_text.lower(): - link_title = "Minutes" - else: - link_title = link_text.strip() - links.append( - {"title": link_title, "href": response.urljoin(link.attrib["href"])} - ) - return links diff --git a/tests/files/chi_ssa_25.html b/tests/files/chi_ssa_25.html deleted file mode 100644 index 076624fc3..000000000 --- a/tests/files/chi_ssa_25.html +++ /dev/null @@ -1,1235 +0,0 @@ - - - - - - - - - - November SSA Meeting 2019 | Little Village Chamber of Commerce - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
-
-
-
-
- -
-
- -
-
-
-
-
-
-
- - - - -
-
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- - - - - - -
-
-
-
-
-
-
-
- -
- - -
- -
- - -
-
Loading Events
-
-
-
-
- -
-
-
-
-
-
-
-
-

November SSA Meeting 2019

-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
-
-
- - - -
-
-
-
-
-
-
-
-
-
-
-
-
- - -
-
-
-
-
-
- -
-

Details

-
- - -
Date:
-
- November 19, 2019 -
- -
Time:
-
-
- 9:00 am - 10:00 am
-
- - - -
Event Category:
- - -
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
-
-
-
- -
-

Venue

-
- -
Second Federal A Division of Self-Help FCU
- -
-
- - -3960 w 26th st - -
- chicago, - - IL - - 60623 - - United States - -
- - + Google Map
-
- - - -
-
-
-
-
-
-
-
-
-
- -
- -
-
-
-
-
-
-
-
-
-
- - -
- - -
- -
- - - - - -
- - -
-
- - en_USEnglish -
-
- - es_MXSpanish - - en_USEnglish -
-
- - - - Scroll to Top - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/test_chi_ssa_25.py b/tests/test_chi_ssa_25.py deleted file mode 100644 index 7d0b714b5..000000000 --- a/tests/test_chi_ssa_25.py +++ /dev/null @@ -1,66 +0,0 @@ -from datetime import datetime -from os.path import dirname, join - -import pytest # noqa -from city_scrapers_core.constants import COMMISSION, PASSED -from city_scrapers_core.utils import file_response -from freezegun import freeze_time - -from city_scrapers.spiders.chi_ssa_25 import ChiSsa25Spider - -test_response = file_response( - join(dirname(__file__), "files", "chi_ssa_25.html"), - url="https://littlevillagechamber.org/calendar/november-ssa-meeting-2019/", -) -spider = ChiSsa25Spider() - -freezer = freeze_time("2020-08-07") -freezer.start() - -parsed_item = [item for item in spider._parse_detail(test_response)][0] - -freezer.stop() - - -def test_title(): - assert parsed_item["title"] == "November SSA Meeting 2019" - - -def test_start(): - assert parsed_item["start"] == datetime(2019, 11, 19, 9) - - -def test_end(): - assert parsed_item["end"] == datetime(2019, 11, 19, 10) - - -def test_id(): - assert parsed_item["id"] == "chi_ssa_25/201911190900/x/november_ssa_meeting_2019" - - -def test_status(): - assert parsed_item["status"] == PASSED - - -def test_location(): - assert parsed_item["location"] == { - "name": "Second Federal A Division of Self-Help FCU", - "address": "3960 w 26th st chicago, IL 60623", - } - - -def test_source(): - assert parsed_item["source"] == test_response.url - - -def test_links(): - assert parsed_item["links"] == [ - { - "href": "https://littlevillagechamber.org/wp-content/uploads/Nov-15-2019-SSA-25-Meeting-Minutes.pdf", # noqa - "title": "Minutes", - } - ] - - -def test_classification(): - assert parsed_item["classification"] == COMMISSION