Build out stub of playwright-based spider
Navigates the JS-constructed DOM and extracts key text from elements.
SimmonsRitchie committed Jan 4, 2024
1 parent bb3b68e commit a5f6023
Showing 2 changed files with 68 additions and 0 deletions.
12 changes: 12 additions & 0 deletions city_scrapers/settings/base.py
@@ -46,6 +46,8 @@
"scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 543,
}



COMMANDS_MODULE = "city_scrapers_core.commands"

EXTENSIONS = {
@@ -55,3 +57,13 @@
CLOSESPIDER_ERRORCOUNT = 5

logging.getLogger("pdfminer").propagate = False

# scrapy-playwright settings
PLAYWRIGHT_BROWSER_TYPE = "firefox"

DOWNLOAD_HANDLERS = {
    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}

TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
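
A setup note, not part of this diff: with these settings, requests only route through Playwright when they opt in via meta={"playwright": True} (as the spider below does), and the configured browser binary must be installed first with the standard Playwright CLI:

playwright install firefox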
56 changes: 56 additions & 0 deletions city_scrapers/spiders/il_criminal_justice_information2.py
@@ -0,0 +1,56 @@
import scrapy
from scrapy_playwright.page import PageMethod
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider

class IlCriminalJusticeInformationSpider(CityScrapersSpider):
    name = "il_criminal_justice_information2"
    agency = "Illinois Criminal Justice Information Authority"
    timezone = "America/Chicago"
    location = {
        "name": "Illinois Criminal Justice Information Authority",
        "address": "300 W Adams St, Suite 200, Chicago, IL 60606",
    }
    def start_requests(self):
        url = "https://icjia.illinois.gov/news/meetings/"
        yield scrapy.Request(
            url,
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    # Wait for the Vuetify data table body to render before parsing
                    PageMethod("wait_for_selector", ".v-data-table__wrapper > table > tbody"),
                ],
            },
            # errback is a Request argument, not a meta key; inside meta it never fires
            errback=self.errback,
        )

    async def parse(self, response):
        """`parse` should always `yield` Meeting items.

        For now this stub walks the JS-rendered table and prints key text;
        swap the prints for Meeting construction as the spider is built out.
        """
        # The live page handle is available because playwright_include_page was set
        page = response.meta["playwright_page"]

        # Iterate over element handles directly; clicking rows inserts expanded
        # detail rows into the tbody, which would throw off nth-child re-queries.
        rows = await page.query_selector_all("table > tbody > tr")
        for row_no, row_el in enumerate(rows, start=1):
            print("row no.", row_no)
            # The fourth cell holds the meeting title
            title_element = await row_el.query_selector("td:nth-child(4) > div")
            if title_element:
                title_text = await title_element.inner_text()
                print("Title:", title_text)
            # Clicking the row expands it, revealing the detail content
            await row_el.click()
            detail_row_el = await page.query_selector(
                ".v-data-table__expanded.v-data-table__expanded__content > td > div"
            )
            if detail_row_el:
                detail_row_text = await detail_row_el.inner_text()
                print("Detail:", detail_row_text)

        await page.close()

    async def errback(self, failure):
        # Close the Playwright page even when the request fails, to avoid leaks
        page = failure.request.meta["playwright_page"]
        await page.close()
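
A usage sketch for trying the stub locally (assuming the standard Scrapy CLI, with scrapy-playwright and a Firefox binary installed):

scrapy crawl il_criminal_justice_information2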
