From 9e34262586d7ca027bcf8fabea02248099b0694f Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Wed, 1 May 2024 14:59:59 -0500 Subject: [PATCH] Deprecate spider chi_ssa_25 --- city_scrapers/spiders/chi_ssa_25.py | 100 --- tests/files/chi_ssa_25.html | 1235 --------------------------- tests/test_chi_ssa_25.py | 66 -- 3 files changed, 1401 deletions(-) delete mode 100644 city_scrapers/spiders/chi_ssa_25.py delete mode 100644 tests/files/chi_ssa_25.html delete mode 100644 tests/test_chi_ssa_25.py diff --git a/city_scrapers/spiders/chi_ssa_25.py b/city_scrapers/spiders/chi_ssa_25.py deleted file mode 100644 index 56e1f3600..000000000 --- a/city_scrapers/spiders/chi_ssa_25.py +++ /dev/null @@ -1,100 +0,0 @@ -import html -import json -import re -from datetime import datetime - -from city_scrapers_core.constants import COMMISSION -from city_scrapers_core.items import Meeting -from city_scrapers_core.spiders import CityScrapersSpider -from scrapy import Selector - - -class ChiSsa25Spider(CityScrapersSpider): - name = "chi_ssa_25" - agency = "Chicago Special Service Area #25 Little Village" - timezone = "America/Chicago" - start_urls = ["https://littlevillagechamber.org/ssa-25/meetings-minutes/"] - - def parse(self, response): - """ - `parse` should always `yield` Meeting items. - - Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping - needs. - """ - yield from self._parse_events(response) - # Only parse one previous page of results in addition to the main page - for prev_link in response.css("a.tribe-events-c-nav__prev"): - yield response.follow(prev_link.attrib["href"], callback=self._parse_events) - - def _parse_events(self, response): - for event_link in response.css( - "a.tribe-events-calendar-list__event-title-link" - ): - yield response.follow( - event_link.attrib["href"], callback=self._parse_detail - ) - - def _parse_detail(self, response): - schema_text = response.css( - "[type='application/ld+json']:not(.yoast-schema-graph)::text" - ).extract_first() - if not schema_text: - return - schema_data = json.loads(schema_text)[0] - meeting = Meeting( - title=schema_data["name"], - description=self._parse_description(schema_data), - classification=COMMISSION, - start=self._parse_dt_str(schema_data["startDate"]), - end=self._parse_dt_str(schema_data["endDate"]), - time_notes="", - all_day=False, - location=self._parse_location(schema_data), - links=self._parse_links(response), - source=schema_data["url"], - ) - meeting["status"] = self.get_status(meeting) - meeting["id"] = self.get_id(meeting) - - yield meeting - - def _parse_description(self, item): - desc_sel = Selector(text=html.unescape(item.get("description", ""))) - return re.sub( - r"\s+", " ", " ".join(desc_sel.css("*::text").extract()).replace("\\n", "") - ).strip() - - def _parse_dt_str(self, dt_str): - return datetime.strptime(dt_str[:-6], "%Y-%m-%dT%H:%M:%S") - - def _parse_location(self, item): - location = item["location"] - if "conference call" in location["name"].lower() or "Zoom" in location["name"]: - return { - "name": "Conference Call", - "address": "", - } - loc_addr = location["address"] - addr_str = " ".join( - [ - loc_addr["streetAddress"], - f"{loc_addr.get('addressLocality', 'Chicago')}, {loc_addr.get('addressRegion', 'IL')}", # noqa - loc_addr["postalCode"], - ] - ) - return {"name": location["name"], "address": addr_str} - - def _parse_links(self, response): - """Parse or generate links""" - links = [] - for link in response.css("#primary .fl-row-content-wrap a.uabb-button"): - link_text = " ".join(link.css("*::text").extract()) - if "minutes" in link_text.lower(): - link_title = "Minutes" - else: - link_title = link_text.strip() - links.append( - {"title": link_title, "href": response.urljoin(link.attrib["href"])} - ) - return links diff --git a/tests/files/chi_ssa_25.html b/tests/files/chi_ssa_25.html deleted file mode 100644 index 076624fc3..000000000 --- a/tests/files/chi_ssa_25.html +++ /dev/null @@ -1,1235 +0,0 @@ - - -
- - - - - - -