From f1083863ea9980252edcc3e6250cb12d22204fc1 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Thu, 12 Dec 2024 16:45:06 -0600 Subject: [PATCH] PA: attempt to fix duplicate vote events in scrape --- scrapers/pa/bills.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scrapers/pa/bills.py b/scrapers/pa/bills.py index f332af7776..fed1bd680a 100644 --- a/scrapers/pa/bills.py +++ b/scrapers/pa/bills.py @@ -38,13 +38,20 @@ def scrape_session(self, chamber, session, special=0): url = utils.bill_list_url(chamber, session, special) page = self.get_page(url) + # PA website repeats some bills on the listing page + # ex: resolutions that are also concurrent resolutions + bill_urls_seen = [] + RETRY_TIMES = 5 for link in page.xpath('//a[@class="bill"]'): is_parsed = False for retry_time in range(0, RETRY_TIMES): try: - yield from self.parse_bill(chamber, session, special, link) + if link.attrib["href"] not in bill_urls_seen: + bill_urls_seen.append(link.attrib["href"]) + yield from self.parse_bill(chamber, session, special, link) is_parsed = True + break except Exception as e: self.logger.warning(