From 259266d9e9d477c5e4283f4a07c4d92ec8578266 Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Thu, 14 Dec 2023 13:14:22 -0600 Subject: [PATCH 1/7] Fix spider Fixes URL and HTML selectors --- city_scrapers/spiders/il_commerce.py | 64 +++++++++++++++------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/city_scrapers/spiders/il_commerce.py b/city_scrapers/spiders/il_commerce.py index 329767fc6..af5f62468 100644 --- a/city_scrapers/spiders/il_commerce.py +++ b/city_scrapers/spiders/il_commerce.py @@ -11,7 +11,7 @@ class IlCommerceSpider(CityScrapersSpider): agency = "Illinois Commerce Commission" timezone = "America/Chicago" start_urls = [ - "https://www.icc.illinois.gov/meetings/default.aspx?dts=32&et=1&et=5&et=3" + "https://www.icc.illinois.gov/meetings?bd=638381088000000000&dts=32&scm=True&sps=True&sh=True&sjc=True&ssh=False&smceb=True" ] def parse(self, response): @@ -21,30 +21,26 @@ def parse(self, response): Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping needs. 
""" - for nav_link in response.css(".col-sm-7 a.btn"): - if "?bd=" in nav_link.attrib["href"]: - yield response.follow( - nav_link.attrib["href"], callback=self._parse_events_page - ) - - yield from self._parse_events_page(response) + event_links = response.css(".p-2 a.day") + for event_link in event_links: + href = event_link.attrib["href"] + yield response.follow( + href, callback=self._parse_events_page + ) def _parse_events_page(self, response): - for item in response.css(".panel-body a"): - yield response.follow(item.attrib["href"], callback=self._parse_detail) - - def _parse_detail(self, response): + panel = response.css(".soi-icc-container .col-12") title = self._parse_title(response) meeting = Meeting( title=title, - description=self._parse_description(response), + description=self._parse_description(panel), classification=self._parse_classification(title), - start=self._parse_start(response), + start=self._parse_start(panel), end=None, all_day=False, time_notes="", - location=self._parse_location(response), - links=self._parse_links(response), + location=self._parse_location(panel), + links=self._parse_links(panel, response), source=response.url, ) @@ -55,10 +51,10 @@ def _parse_detail(self, response): yield meeting - def _parse_title(self, response): + def _parse_title(self, selector): """Parse or generate meeting title.""" title_str = re.sub( - r"\s+", " ", " ".join(response.css(".soi-container h2 *::text").extract()) + r"\s+", " ", " ".join(selector.css(".soi-container h2 *::text").extract()) ).strip() return re.sub( r"(Illinois Commerce Commission|(?=Committee )Committee Meeting$)", @@ -66,10 +62,10 @@ def _parse_title(self, response): title_str, ).strip() - def _parse_description(self, response): + def _parse_description(self, selector): """Parse or generate meeting description.""" return re.sub( - r"\s+", " ", " ".join(response.css(".col-sm-12 > p *::text").extract()) + r"\s+", " ", " ".join(selector.css(".mt-4+ p *::text").extract()) ).strip() def 
_parse_classification(self, title): @@ -80,18 +76,23 @@ def _parse_classification(self, title): return COMMITTEE return COMMISSION - def _parse_start(self, response): + def _parse_start(self, selector): """Parse start datetime as a naive datetime object.""" - start_str = " ".join(response.css("h3.mt-4 *::text").extract()) + start_str = " ".join(selector.css("h3.mt-4 *::text").extract()) dt_str = re.search( r"[A-Z][a-z]{2,8} \d{1,2}, \d{4} \d{1,2}:\d{2} [APM]{2}", start_str ).group() return datetime.strptime(dt_str, "%B %d, %Y %I:%M %p") - def _parse_location(self, response): + def _parse_location(self, selector): """Parse or generate location.""" - location_block = response.css(".row.mt-4 > .col-12")[0] - location_items = location_block.css("p *::text").extract() + location_block = selector.css(".row.mt-4 > .col-12") + if len(location_block) == 0: + return { + "address": "", + "name": "TBD", + } + location_items = location_block[0].css("p *::text").extract() addr_items = [ i.strip() for i in location_items if "Building" not in i and i.strip() ] @@ -103,14 +104,17 @@ def _parse_location(self, response): "name": " ".join(name_items), } - def _parse_links(self, response): + def _parse_links(self, selector, response): """Parse or generate links.""" - links = [] - for link in response.css(".row.mt-4 .list-unstyled a"): - links.append( + links = selector.css(".row.mt-4 .list-unstyled a") + urls = [] + if not links: + return urls + for link in links: + urls.append( { "title": " ".join(link.css("*::text").extract()).strip(), "href": response.urljoin(link.attrib["href"]), } ) - return links + return urls From 9a6d19354f25ca929a333d3e5a04c86d94c70768 Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Thu, 14 Dec 2023 16:42:23 -0600 Subject: [PATCH 2/7] Fix status field --- city_scrapers/spiders/il_commerce.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/city_scrapers/spiders/il_commerce.py 
b/city_scrapers/spiders/il_commerce.py index af5f62468..e2d7860de 100644 --- a/city_scrapers/spiders/il_commerce.py +++ b/city_scrapers/spiders/il_commerce.py @@ -43,10 +43,8 @@ def _parse_events_page(self, response): links=self._parse_links(panel, response), source=response.url, ) - - meeting["status"] = self._get_status( - meeting, text=" ".join(response.css(".col-sm-12 *::text").extract()) - ) + status_str = " ".join(response.css("h3 *::text").extract()) + meeting["status"] = self._get_status(meeting, text=status_str) meeting["id"] = self._get_id(meeting) yield meeting From 5eb0822a6e9f04399668366af4aa9ff83ff8c79f Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Thu, 14 Dec 2023 16:59:24 -0600 Subject: [PATCH 3/7] Streamline URL --- city_scrapers/spiders/il_commerce.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/city_scrapers/spiders/il_commerce.py b/city_scrapers/spiders/il_commerce.py index e2d7860de..ee627dd46 100644 --- a/city_scrapers/spiders/il_commerce.py +++ b/city_scrapers/spiders/il_commerce.py @@ -11,7 +11,8 @@ class IlCommerceSpider(CityScrapersSpider): agency = "Illinois Commerce Commission" timezone = "America/Chicago" start_urls = [ - "https://www.icc.illinois.gov/meetings?bd=638381088000000000&dts=32&scm=True&sps=True&sh=True&sjc=True&ssh=False&smceb=True" + # Returns a page with 32 days of meetings from today's date, including today. 
+ "https://www.icc.illinois.gov/meetings?dts=32&scm=True&sps=True&sh=True&sjc=True&ssh=False&smceb=True" ] def parse(self, response): From 576f94589cf721943f90dd7eb8fbcfcf6f542d3d Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Sun, 17 Dec 2023 09:20:30 -0600 Subject: [PATCH 4/7] Tweak method name --- city_scrapers/spiders/il_commerce.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/city_scrapers/spiders/il_commerce.py b/city_scrapers/spiders/il_commerce.py index ee627dd46..5bffdd187 100644 --- a/city_scrapers/spiders/il_commerce.py +++ b/city_scrapers/spiders/il_commerce.py @@ -26,10 +26,10 @@ def parse(self, response): for event_link in event_links: href = event_link.attrib["href"] yield response.follow( - href, callback=self._parse_events_page + href, callback=self._parse_event_page ) - def _parse_events_page(self, response): + def _parse_event_page(self, response): panel = response.css(".soi-icc-container .col-12") title = self._parse_title(response) meeting = Meeting( From c62f23bb506a107e88a45a13c74a6b78ebfb61e1 Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Sun, 17 Dec 2023 13:11:33 -0800 Subject: [PATCH 5/7] Update with new fixtures and assertions --- tests/files/il_commerce.html | 1670 ++++++++++++++++++++++----- tests/files/il_commerce_detail.html | 415 ++++--- tests/test_il_commerce.py | 57 +- 3 files changed, 1650 insertions(+), 492 deletions(-) diff --git a/tests/files/il_commerce.html b/tests/files/il_commerce.html index f60be189d..a22a8975a 100644 --- a/tests/files/il_commerce.html +++ b/tests/files/il_commerce.html @@ -1,159 +1,241 @@ + +
+ + +