diff --git a/city_scrapers/spiders/cook_justice_advisory.py b/city_scrapers/spiders/cook_justice_advisory.py deleted file mode 100644 index ce79b6028..000000000 --- a/city_scrapers/spiders/cook_justice_advisory.py +++ /dev/null @@ -1,189 +0,0 @@ -import re -from collections import defaultdict -from datetime import datetime - -import scrapy -from city_scrapers_core.constants import ADVISORY_COMMITTEE -from city_scrapers_core.items import Meeting -from city_scrapers_core.spiders import CityScrapersSpider -from dateutil.relativedelta import relativedelta - - -class CookJusticeAdvisorySpider(CityScrapersSpider): - name = "cook_justice_advisory" - agency = "Cook County Justice Advisory Council" - timezone = "America/Chicago" - start_urls = [ - "https://www.cookcountyil.gov/service/justice-advisory-council-meetings" - ] - - def __init__(self, *args, **kwargs): - self.agenda_map = defaultdict(list) - super().__init__(*args, **kwargs) - - def _parse_meetings_page(self, response): - today = datetime.now() - for month_delta in range(-3, 6): - mo_str = (today + relativedelta(months=month_delta)).strftime("%Y-%m") - url = ( - "https://www.cookcountyil.gov/" - "calendar-node-field-date/month/{}".format(mo_str) - ) - yield scrapy.Request( - url=url, method="GET", callback=self._parse_events_page - ) - - def _parse_events_page(self, response): - for url in self._get_event_urls(response): - yield scrapy.Request(url, callback=self._parse_event, dont_filter=True) - - def parse(self, response): - """ - `parse` should always `yield` Meeting items. - - Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping - needs. - """ - self._parse_links(response) - yield from self._parse_meetings_page(response) - - def _parse_event(self, response): - """Parse the event page.""" - title = self._parse_title(response) - start = self._parse_start(response) - links_key = start.strftime("%y-%m") - - meeting = Meeting( - title=title, - description=self._parse_description(response), - classification=ADVISORY_COMMITTEE, - start=start, - end=self._parse_end(response), - time_notes="", - all_day=self._parse_all_day(response), - location=self._parse_location(response), - links=self.agenda_map[links_key], - source=response.url, - ) - - meeting["id"] = self._get_id(meeting) - meeting["status"] = self._get_status(meeting) - - return meeting - - def _get_event_urls(self, response): - """ - Get urls for all justice advisory council (JAC in calendar) meetings on the page - """ - responses = [] - events = response.xpath("//a[contains(@href, 'event')]") - for event in events: - event_title = event.xpath("text()").extract_first().lower() - if ( - "jac" in event_title - or "justice advisory" in event_title - or ("justice" in event_title and "advisory" in event_title) - ): - href = event.xpath("./@href").extract_first() - responses.append(response.urljoin(href)) - return responses - - def _parse_location(self, response): - """ - Parse or generate location. - """ - address = response.xpath( - '//div[@class="field event-location"]/descendant::*/text()' - ).extract() - for word in ["Location:", ", ", " "]: - address.remove(word) - address = " ".join(address) - if "Microsoft Teams" in address: - return { - "address": "", - "name": "", - } - else: - return { - "address": address, - "name": "", - } - - def _parse_all_day(self, response): - """ - Parse or generate all-day status. Defaults to false. - """ - date = response.xpath( - '//span[@class="date-display-single"]/descendant-or-self::*/text()' - ).extract() - date = "".join(date).upper() - return "ALL DAY" in date - - def _parse_title(self, response): - """Parse or generate event""" - title = "".join(response.css("h1::text").extract()) - if "JAC Council Meeting" in title: - return "Justice Advisory Council" - else: - return title - - def _parse_description(self, response): - """Parse or generate event description.""" - category_field = response.xpath( - "//div[contains(., 'Category:') and contains(@class, 'field-label')]" - ) - field_items = category_field.xpath( - "./following::div[contains(@class, 'field-items')]" - ) - return " ".join( - field_items.xpath(".//p/text()").extract() - + field_items.xpath(".//strong/text()").extract() - ).strip() - - def _parse_start(self, response): - """Parse start date and time""" - start = response.xpath( - '//span[@class="date-display-single"]/descendant-or-self::*/text()' - ).extract() - start = "".join(start).upper() - start = start.split(" TO ")[0].strip() - start = start.replace("(ALL DAY)", "12:00AM") - - return datetime.strptime(start, "%B %d, %Y %I:%M%p") - - def _parse_end(self, response): - """Parse end date and time""" - date = response.xpath( - '//span[@class="date-display-single"]/descendant-or-self::*/text()' - ).extract() - date = "".join(date).upper() - date.replace("(ALL DAY)", "TO 11:59PM") - start_end = date.split(" TO ") - - if len(start_end) < 2: - return - - end_time = start_end[1] - date = start_end[0][: start_end[0].rindex(" ")] - return datetime.strptime("{} {}".format(date, end_time), "%B %d, %Y %I:%M%p") - - def _parse_links(self, response): - """Parse links""" - links = response.css("span.file a") - links = links[2:] - for link in links: - link_name = link.xpath("text()").extract_first() - link_name = link_name.replace("\xa0", " ") - link_path = link.xpath("./@href").extract_first() - pattern = r"(?P[a-zA-Z]+)( *)(?P[a-zA-Z]+)( *)(?P\d{4})" - regex = re.search(pattern, link_name) - if regex is not None: - raw_monthyear = regex.group("m") + " " + regex.group("y") - if len(regex.group("m")) < 4: - date_obj = datetime.strptime(raw_monthyear, "%b %Y") - else: - date_obj = datetime.strptime(raw_monthyear, "%B %Y") - formatted_date = datetime.strftime(date_obj, "%y-%m") - self.agenda_map[formatted_date].append( - {"href": link_path, "title": regex.group("type")} - ) diff --git a/tests/files/cook_justice_advisory.html b/tests/files/cook_justice_advisory.html deleted file mode 100644 index b0329e09d..000000000 --- a/tests/files/cook_justice_advisory.html +++ /dev/null @@ -1,855 +0,0 @@ - - - - - - - - JAC Council Meeting | CookCountyIL.gov - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - -
- - - - - - - -
- -
-
- -
-
- - -

JAC Council Meeting

- -
- -
- - -
-
Location:
-
69 W. Washington
22nd Floor Conference Room D
Chicago, IL 60602
- - -
-
When:
-
January 9, 2020
8:30am to 10:00am
- - -
-
Category:
-
- -
-

JAC Council Meeting

- - -
-
-
-
- -
-
- - - - - -
-
-
-
- - - -
- - - -
- - - - - - - - - -

Original text


ShareThis Copy and Paste
\ No newline at end of file diff --git a/tests/files/cook_justice_advisory_details.html b/tests/files/cook_justice_advisory_details.html deleted file mode 100644 index 122df4645..000000000 --- a/tests/files/cook_justice_advisory_details.html +++ /dev/null @@ -1,1468 +0,0 @@ - - - - - - - - Justice Advisory Council Meetings | CookCountyIL.gov - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - -
- - - - - - - -
- -
-
- -
-

Justice Advisory Council Board Members 2019 -

- -

-  -

- -
- -

Justice Advisory Council January 2017 Staff Report

- - -
- FileJustice Advisory Council January 2017 Staff Report
Download Title: 
Justice Advisory Council January 2017 Staff Report
- - -
-
- -

Justice Advisory Council Meeting Agendas 2020

- - - -

Justice Advisory Council Meeting Minutes 2016

- - -

Justice Advisory Council Meeting Agendas 2016

- - -

Justice Advisory Council Meeting Agendas 2017

- - -

Justice Advisory Council Meeting Minutes 2016

- - -

Justice Advisory Council Meeting Minutes 2017

- - -

Justice Advisory Council Meeting Agenda 2018

- - -

Justice Advisory Council Meeting Minutes 2018

- - -

Justice Advisory Council Meeting Agenda 2019

- - -

Justice Advisory Council Minutes 2019

-
-
- -
- - - -
- - - -
- - - - - - - - - -

Original text


ShareThis Copy and Paste
\ No newline at end of file diff --git a/tests/test_cook_justice_advisory.py b/tests/test_cook_justice_advisory.py deleted file mode 100644 index f391c7e8c..000000000 --- a/tests/test_cook_justice_advisory.py +++ /dev/null @@ -1,82 +0,0 @@ -from datetime import datetime -from os.path import dirname, join - -from city_scrapers_core.constants import ADVISORY_COMMITTEE, PASSED -from city_scrapers_core.utils import file_response -from freezegun import freeze_time - -from city_scrapers.spiders.cook_justice_advisory import CookJusticeAdvisorySpider - -test_response = file_response( - join(dirname(__file__), "files", "cook_justice_advisory.html"), - url="https://www.cookcountyil.gov/event/jac-council-meeting-18", -) - -test_detail_response = file_response( - join(dirname(__file__), "files", "cook_justice_advisory_details.html"), - url=("https://www.cookcountyil.gov/service/justice-advisory-council-meetings"), -) - -spider = CookJusticeAdvisorySpider() -freezer = freeze_time("2020-6-12") -freezer.start() -spider._parse_links(test_detail_response) -item = spider._parse_event(test_response) -freezer.stop() - - -def test_title(): - assert item["title"] == "Justice Advisory Council" - - -def test_start(): - assert item["start"] == datetime(2020, 1, 9, 8, 30) - - -def test_end(): - assert item["end"] == datetime(2020, 1, 9, 10, 00) - - -def test_time_notes(): - assert item["time_notes"] == "" - - -def test_id(): - assert item["id"] == "cook_justice_advisory/202001090830/x/justice_advisory_council" - - -def test_all_day(): - assert item["all_day"] is False - - -def test_classification(): - assert item["classification"] == ADVISORY_COMMITTEE - - -def test_status(): - assert item["status"] == PASSED - - -def test_location(): - assert item["location"] == { - "name": "", - "address": "69 W. Washington 22nd Floor Conference Room D Chicago IL 60602", - } - - -def test_sources(): - assert item["source"] == "https://www.cookcountyil.gov/event/jac-council-meeting-18" - - -def test_description(): - assert item["description"] == "JAC Council Meeting" - - -def test_links(): - assert item["links"] == [ - { - "href": "https://www.cookcountyil.gov/sites/default/" - "files/jac_council_agenda_1.9.2020_1.pdf", - "title": "Agenda", - } - ]