-
-
Notifications
You must be signed in to change notification settings - Fork 310
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Build out stub of playwright-based spider
Navigates the JS-constructed DOM and extracts key text from elements.
- Loading branch information
1 parent
bb3b68e
commit a5f6023
Showing
2 changed files
with
68 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import scrapy | ||
from scrapy_playwright.page import PageMethod | ||
from city_scrapers_core.items import Meeting | ||
from city_scrapers_core.spiders import CityScrapersSpider | ||
|
||
class IlCriminalJusticeInformationSpider(CityScrapersSpider): | ||
name = "il_criminal_justice_information2" | ||
agency = "Illinois Criminal Justice Information Authority" | ||
timezone = "America/Chicago" | ||
location = { | ||
"name": "Illinois Criminal Justice Information Authority", | ||
"address": "300 W Adams St, Suite 200, Chicago, IL 60606", | ||
} | ||
|
||
def start_requests(self): | ||
url = "https://icjia.illinois.gov/news/meetings/" | ||
yield scrapy.Request(url, meta={ | ||
'playwright': True, | ||
'playwright_include_page': True, | ||
'errback': self.errback, | ||
'playwright_page_methods': [ | ||
PageMethod('wait_for_selector', '.v-data-table__wrapper > table > tbody'), | ||
], | ||
|
||
}) | ||
|
||
async def parse(self, response): | ||
""". | ||
`parse` should always `yield` Meeting items. | ||
Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping | ||
needs. | ||
""" | ||
page = response.meta["playwright_page"] | ||
|
||
rows = await page.query_selector_all('table > tbody > tr') | ||
row_count = len(rows) | ||
for row_no in range(1, row_count + 1): | ||
row_el = await page.query_selector(f'table > tbody > tr:nth-child({row_no})') | ||
print("row no.", row_no) | ||
title_element = await row_el.query_selector('td:nth-child(4) > div') | ||
if title_element: | ||
title_text = await title_element.inner_text() | ||
print("Title:", title_text) | ||
await row_el.click() | ||
detail_row_el = await page.query_selector(".v-data-table__expanded.v-data-table__expanded__content > td > div") | ||
if detail_row_el: | ||
detail_row_text = await detail_row_el.inner_text() | ||
print("Detail:", detail_row_text) | ||
|
||
await page.close() | ||
|
||
|
||
async def errback(self, failure): | ||
page = failure.request.meta["playwright_page"] | ||
await page.close() |