From 4e5a263708a781db9fcf93778dd347fe54de2d74 Mon Sep 17 00:00:00 2001
From: braykuka
Date: Mon, 9 Dec 2024 15:15:30 +0100
Subject: [PATCH 1/4] IL: Bill scraper rewrite to new beta site

---
 scrapers/il/__init__.py |  78 +++++++++++---------
 scrapers/il/bills.py    | 154 ++++++++++++++++++++--------------------
 scrapers/il/events.py   |  71 ++++++++++--------
 3 files changed, 165 insertions(+), 138 deletions(-)

diff --git a/scrapers/il/__init__.py b/scrapers/il/__init__.py
index ed5889785b..2d3aa32a76 100644
--- a/scrapers/il/__init__.py
+++ b/scrapers/il/__init__.py
@@ -1,4 +1,6 @@
 # encoding=utf-8
+from urllib import response
+import requests
 from utils import url_xpath
 from openstates.scrape import State
 from .bills import IlBillScraper
@@ -6,16 +8,13 @@


 class Illinois(State):
-    scrapers = {
-        "bills": IlBillScraper,
-        "events": IlEventScraper,
-    }
+    scrapers = {"bills": IlBillScraper, "events": IlEventScraper}
     legislative_sessions = [
         {
             "name": "90th Regular Session",
             "identifier": "90th",
             "classification": "primary",
-            "_scraped_name": "90 (1997-1998)",
+            "_scraped_name": "90th General Assembly (1997-1998)",
             "start_date": "1997-01-08",
             "end_date": "1999-01-12",
         },
@@ -23,7 +22,7 @@ class Illinois(State):
             "name": "91st Regular Session",
             "identifier": "91st",
             "classification": "primary",
-            "_scraped_name": "91 (1999-2000)",
+            "_scraped_name": "91st General Assembly (1999-2000)",
             "start_date": "1999-01-13",
             "end_date": "2001-01-09",
         },
@@ -31,7 +30,7 @@ class Illinois(State):
             "name": "92nd Regular Session",
             "identifier": "92nd",
             "classification": "primary",
-            "_scraped_name": "92 (2001-2002)",
+            "_scraped_name": "92nd General Assembly (2001-2002)",
             "start_date": "2001-01-10",
             "end_date": "2003-01-07",
         },
@@ -39,7 +38,7 @@ class Illinois(State):
             "name": "93rd Regular Session",
             "identifier": "93rd",
             "classification": "primary",
-            "_scraped_name": "93 (2003-2004)",
+            "_scraped_name": "93rd General Assembly (2003-2004)",
             "start_date": "2003-01-08",
             "end_date": "2005-01-11",
         },
@@ -54,7 +53,7 @@ class Illinois(State):
             "name": "94th Regular Session",
             "identifier": "94th",
             "classification": "primary",
-            "_scraped_name": "94 (2005-2006)",
+            "_scraped_name": "94th General Assembly (2005-2006)",
             "start_date": "2005-01-12",
             "end_date": "2007-01-09",
         },
@@ -62,7 +61,7 @@ class Illinois(State):
             "name": "95th Regular Session",
             "identifier": "95th",
             "classification": "primary",
-            "_scraped_name": "95 (2007-2008)",
+            "_scraped_name": "95th General Assembly (2007-2008)",
             "start_date": "2007-01-10",
             "end_date": "2009-01-13",
         },
@@ -77,7 +76,7 @@ class Illinois(State):
             "name": "96th Regular Session",
             "identifier": "96th",
             "classification": "primary",
-            "_scraped_name": "96 (2009-2010)",
+            "_scraped_name": "96th General Assembly (2009-2010)",
             "start_date": "2009-01-14",
             "end_date": "2011-01-11",
         },
@@ -92,7 +91,7 @@ class Illinois(State):
             "name": "97th Regular Session",
             "identifier": "97th",
             "classification": "primary",
-            "_scraped_name": "97 (2011-2012)",
+            "_scraped_name": "97th General Assembly (2011-2012)",
             "start_date": "2011-01-12",
             "end_date": "2013-01-08",
         },
@@ -100,7 +99,7 @@ class Illinois(State):
             "name": "98th Regular Session",
             "identifier": "98th",
             "classification": "primary",
-            "_scraped_name": "98 (2013-2014)",
+            "_scraped_name": "98th General Assembly (2013-2014)",
             "start_date": "2013-01-09",
             "end_date": "2015-01-13",
         },
@@ -108,7 +107,7 @@ class Illinois(State):
             "name": "99th Regular Session",
             "identifier": "99th",
             "classification": "primary",
-            "_scraped_name": "99 (2015-2016)",
+            "_scraped_name": "99th General Assembly (2015-2016)",
"99th General Assembly (2015-2016)", "start_date": "2015-01-14", "end_date": "2017-01-10", }, @@ -116,7 +115,7 @@ class Illinois(State): "name": "100th Special Session", "identifier": "100th-special", "classification": "special", - "_scraped_name": "100 (2017-2018)", + "_scraped_name": "100th General Assembly (2017-2018)", "start_date": "2017-06-21", "end_date": "2017-06-21", }, @@ -133,12 +132,12 @@ class Illinois(State): "start_date": "2019-01-09", "end_date": "2019-12-14", "classification": "primary", - "_scraped_name": "101 (2019-2020)", + "_scraped_name": "101st General Assembly (2019-2020)", }, # Leave this on until 2023-01-31, # IL has a history post-session governor actions { - "_scraped_name": "102 (2021-2022)", + "_scraped_name": "102nd General Assembly (2021-2022)", "name": "102nd Regular Session", "identifier": "102nd", "start_date": "2021-01-13", @@ -148,7 +147,16 @@ class Illinois(State): }, # check senate prez in session_details in bills.py # https://www.ilga.gov/house/schedules/2024_House_Spring_Session.pdf - { + # { + # "name": "103rd Regular Session", + # "identifier": "103rd", + # "start_date": "2023-01-11", + # "end_date": "2024-05-24", + # "classification": "primary", + # "active": False, + # }, + { + "_scraped_name": "103rd General Assembly (2023-2024)", "name": "103rd Regular Session", "identifier": "103rd", "start_date": "2023-01-11", @@ -159,20 +167,26 @@ class Illinois(State): ] ignored_scraped_sessions = [ - "77 (1971-1972)", - "78 (1973-1974)", - "79 (1975-1976)", - "80 (1977-1978)", - "81 (1979-1980)", - "82 (1981-1982)", - "83 (1983-1984)", - "84 (1985-1986)", - "85 (1987-1988)", - "86 (1989-1990)", - "87 (1991-1992)", - "88 (1993-1994)", - "89 (1995-1996)", + "89th General Assembly (1995-1996)", + "88th General Assembly (1993-1994)", + "87th General Assembly (1991-1992)", + "86th General Assembly (1989-1990)", + "85th General Assembly (1987-1988)", + "84th General Assembly (1985-1986)", + "83rd General Assembly (1983-1984)", + "82nd General Assembly (1981-1982)", + "81st General Assembly (1979-1980)", + "80th General Assembly (1977-1978)", + "79th General Assembly (1975-1976)", + "78th General Assembly (1973-1974)", + "77th General Assembly (1971-1972)", ] def get_session_list(self): - return url_xpath("https://ilga.gov/PreviousGA.asp", "//option/text()") + response = requests.get( + "https://beta.ilga.gov/API/Legislation/GetGeneralAssemblies" + ) + response.raise_for_status() + session_list = [ga["gaLabel"] for ga in response.json()] + + return session_list diff --git a/scrapers/il/bills.py b/scrapers/il/bills.py index b45d261ed1..926dced675 100644 --- a/scrapers/il/bills.py +++ b/scrapers/il/bills.py @@ -279,19 +279,22 @@ def chamber_slug(chamber): class IlBillScraper(Scraper): - LEGISLATION_URL = "https://ilga.gov/legislation/grplist.asp" + LEGISLATION_URL = "https://beta.ilga.gov/Legislation/" localize = pytz.timezone("America/Chicago").localize def get_bill_urls(self, chamber, session, doc_type): params = session_details[session]["params"] - params["num1"] = "1" - params["num2"] = "10000" - params["DocTypeID"] = doc_type - html = self.get(self.LEGISLATION_URL, params=params).text + url = "https://beta.ilga.gov/Legislation/RegularSession/{}?SessionId={}".format( + doc_type, + params["SessionId"], + ) + html = self.get(url).text doc = lxml.html.fromstring(html) - doc.make_links_absolute(self.LEGISLATION_URL) + doc.make_links_absolute(url) - for bill_url in doc.xpath("//li/a/@href"): + for bill_url in doc.xpath( + 
'//div[@id="div_0001"]//table//td[1]/a[contains(@href, "DocNum=")]/@href' + ): yield bill_url def scrape(self, session=None): @@ -321,16 +324,9 @@ def scrape(self, session=None): chamber, session_id, "AM", bill_url, "appointment" ) - # TODO: get joint session resolution added to python-opencivicdata - # for bill_url in self.get_bill_urls(chamber, session_id, 'JSR'): - # bill, votes = self.scrape_bill(chamber, session_id, 'JSR', bill_url, - # 'joint session resolution') - # yield bill - # yield from votes - def scrape_archive_bills(self, session): session_abr = session[0:2] - url = f"https://www.ilga.gov/legislation/legisnet{session_abr}/{session_abr}gatoc.html" + url = f"https://beta.ilga.gov/documents/legislation/legisnet{session_abr}/{session_abr}gatoc.html" html = self.get(url).text doc = lxml.html.fromstring(html) doc.make_links_absolute(url) @@ -338,6 +334,7 @@ def scrape_archive_bills(self, session): # Contains multiple bills for bill_numbers_section_url in bill_numbers_sections: + bill_numbers_section_url = clean_archivebill_url(bill_numbers_section_url) bill_section_html = self.get(bill_numbers_section_url).text bill_section_doc = lxml.html.fromstring(bill_section_html) bill_section_doc.make_links_absolute(bill_numbers_section_url) @@ -351,6 +348,7 @@ def scrape_archive_bills(self, session): # Actual Bill Pages for bill_url in bills_urls: + bill_url = clean_archivebill_url(bill_url) bill_html = self.get(bill_url).text bill_doc = lxml.html.fromstring(bill_html) @@ -377,6 +375,7 @@ def scrape_archive_bills(self, session): summary_page_url = bill_doc.xpath( '//a[contains (., "Bill Summary")]/@href' )[0] + summary_page_url = clean_archivebill_url(summary_page_url) summary_page_html = self.get(summary_page_url).text summary_page_doc = lxml.html.fromstring(summary_page_html) summary_page_doc.make_links_absolute(summary_page_url) @@ -387,6 +386,7 @@ def scrape_archive_bills(self, session): bill_url = bill_doc.xpath('//a[contains (., "Bill Status")]/@href')[ 0 ] + bill_url = clean_archivebill_url(bill_url) bill_html = self.get(bill_url).text bill_doc = lxml.html.fromstring(bill_html) bill_doc.make_links_absolute(bill_url) @@ -421,6 +421,7 @@ def scrape_archive_bills(self, session): # Bill version version_url = bill_doc.xpath('//a[contains (., "Full Text")]/@href')[0] + version_url = clean_archivebill_url(version_url) bill.add_version_link(bill_id, version_url, media_type="text/html") # Actions @@ -483,19 +484,15 @@ def scrape_bill(self, chamber, session, doc_type, url, bill_type=None): bill_type = bill_type or DOC_TYPES[doc_type[1:]] bill_id = doc_type + bill_num - title = doc.xpath( - '//span[text()="Short Description:"]/following-sibling::span[1]/' "text()" - )[0].strip() + title = doc.xpath('//div[@id="content"]/div[1]/div/h5/text()')[0].strip() # 1. Find the heading with "Synopsis As Introduced" for text. # 2. Go to the next heading. # 3. Backtrack and grab everything to, but not including, #1. # 4. Grab text of all, including nested, nodes. 
-        summary_nodes = doc.xpath(
-            '//span[text()="Synopsis As Introduced"]/following-sibling::span[contains(@class, "heading2")]/'
-            'preceding-sibling::*[preceding-sibling::span[text()="Synopsis As Introduced"]]//'
+        summary = doc.xpath(
+            '//h5[text()="Synopsis As Introduced"]/../div[@class="list-group"]/span/'
             "text()"
-        )
-        summary = "\n".join([node.strip() for node in summary_nodes])
+        )[0].strip()

         bill = Bill(
             identifier=bill_id,
@@ -509,14 +506,15 @@ def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
         bill.add_source(url)

         # sponsors
-        sponsor_list = build_sponsor_list(doc.xpath('//a[contains(@class, "content")]'))
+        sponsor_list = build_sponsor_list(
+            doc.xpath('//div[@id="sponsorDiv"]//a[@class="notranslate"]')
+        )
         # don't add just yet; we can make them better using action data

-        # actions
-        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
+        action_tds = doc.xpath('//h5[text()="Actions"]/../table//td')
         for date, actor, action_elem in group(action_tds, 3):
             date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")
-            date = self.localize(date).date()
+            date = date.date()

             actor = actor.text_content()
             actor_id = "upper" if actor == "Senate" else "lower"
@@ -581,56 +579,54 @@ def scrape_documents(self, bill, version_url):
         if "HTML full text does not exist for this appropriations document" in html:
             pdf_only = True

-        for link in doc.xpath('//a[contains(@href, "fulltext")]'):
-            name = link.text
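+        # Assumption: on the beta document pages, every full-text variant is
+        # an anchor with class "content" inside the main content rows.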
+        for link in doc.xpath(
+            '//div[@id="content"]/div[contains(@class, "row")]//a[contains(@class, "content")]'
+        ):
+            name = link.text_content().strip()
             url = link.get("href")
-            # Ignore the "Printer-friendly version" link
             # That link is a "latest version" alias for an actual, distinct version
-            if "print=true" not in url:
-                if name in VERSION_TYPES or "amendment" in name.lower():
-                    if pdf_only:
-                        # eed to visit the version's page, and get PDF link from there
-                        # otherwise get a faulty "latest version"/"LV" alias/duplicate
-                        version_page_html = self.get(url).text
-                        version_page_doc = lxml.html.fromstring(version_page_html)
-                        version_page_doc.make_links_absolute(url)
-                        pdf_link = version_page_doc.xpath('//a[text()="PDF"]')[0]
-                        url = pdf_link.get("href")
-                        mimetype = "application/pdf"
-                    else:
-                        url = "{}&print=true".format(url)
-                        mimetype = "text/html"
-
-                    version_id = re.search(
-                        r"DocName=(.*?)&", url, flags=re.IGNORECASE
-                    ).group(1)
-                    doctype = re.search(
-                        r"DocTypeId=(.*?)&", url, flags=re.IGNORECASE
-                    ).group(1)
-                    # numeric component of the session id
-                    session_number = int(
-                        "".join(
-                            char
-                            for char in bill.legislative_session
-                            if char.isdigit()
-                        )
-                    )
-
-                    # if it's html, extract the pdf link too while we're here.
-                    pdf_url = f"https://ilga.gov/legislation/{session_number}/{doctype}/PDF/{version_id}.pdf"
-                    bill.add_version_link(
-                        name, pdf_url, media_type="application/pdf"
-                    )
-
-                    bill.add_version_link(name, url, media_type=mimetype)
-                elif name in FULLTEXT_DOCUMENT_TYPES:
-                    bill.add_document_link(name, url)
-                elif "Printer-Friendly" in name:
-                    pass
+            if name in VERSION_TYPES or "amendment" in name.lower():
+                if pdf_only:
+                    # Need to visit the version's page, and get PDF link from there
+                    # otherwise get a faulty "latest version"/"LV" alias/duplicate
+                    url = "{}&Print=1".format(url)
+                    version_page_html = self.get(url).text
+                    version_page_doc = lxml.html.fromstring(version_page_html)
+                    version_page_doc.make_links_absolute(url)
+                    pdf_link = version_page_doc.xpath('//a[contains(@href, "PDF")]')
+                    if not pdf_link:
+                        continue
+                    pdf_link = pdf_link[0]
+                    url = pdf_link.get("href")
+                    mimetype = "application/pdf"
                 else:
-                    self.warning("unknown document type %s - adding as document" % name)
-                    bill.add_document_link(name, url)
+                    url = "{}&Print=1".format(url)
+                    mimetype = "text/html"
+                version_id = re.search(
+                    r"DocName=(.*?)&", url, flags=re.IGNORECASE
+                ).group(1)
+                doctype = re.search(
+                    r"DocTypeId=(.*?)&", url, flags=re.IGNORECASE
+                ).group(1)
+                # numeric component of the session id
+                session_number = int(
+                    "".join(
+                        char for char in bill.legislative_session if char.isdigit()
+                    )
+                )
+                # if it's html, extract the pdf link too while we're here.
+                pdf_url = f"https://beta.ilga.gov/documents/legislation/{session_number}/{doctype}/PDF/{version_id}.pdf"
+                bill.add_version_link(name, pdf_url, media_type="application/pdf")
+
+                bill.add_version_link(name, url, media_type=mimetype)
+            elif name in FULLTEXT_DOCUMENT_TYPES:
+                bill.add_document_link(name, url)
+            elif "Printer-Friendly" in name:
+                pass
+            else:
+                self.warning("unknown document type %s - adding as document" % name)
+                bill.add_document_link(name, url)

     def scrape_votes(self, session, bill, votes_url):
         html = self.get(votes_url).text
@@ -644,7 +640,7 @@ def scrape_votes(self, session, bill, votes_url):
         pieces = link.text.split(" - ")
         date = pieces[-1]

-        vote_type = link.xpath("../ancestor::table[1]//td[1]/text()")[0]
+        vote_type = link.xpath("../a/text()")[0]
         if vote_type == "Committee Hearing Votes":
             chamber = link.xpath("../following-sibling::td/text()")[0]
             actor = "upper" if chamber == "SENATE" else "lower"
@@ -915,9 +911,9 @@ def build_sponsor_list(sponsor_atags):
     spontype = "cosponsor"
     for atag in sponsor_atags:
         sponsor = atag.text
-        if "house" in atag.attrib["href"].split("/"):
+        if "house" in atag.attrib["href"].lower().split("/"):
            chamber = "lower"
-        elif "senate" in atag.attrib["href"].split("/"):
+        elif "senate" in atag.attrib["href"].lower().split("/"):
             chamber = "upper"
         else:
             chamber = None
@@ -934,3 +930,9 @@ def build_sponsor_list(sponsor_atags):
             official_spontype = "cosponsor"  # until replaced
         sponsors.append((spontype, sponsor, chamber, official_spontype))
     return sponsors
+
+
+def clean_archivebill_url(url):
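+    # The archive pages seem to link against the beta site root, while the
+    # legacy legisnet files actually live under /documents/.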
"https://www.ilga.gov/house/schedules/weeklyhearings.asp", + "upper": "https://beta.ilga.gov/Senate/Schedules", + "lower": "https://beta.ilga.gov/House/Schedules", } chamber_names = { @@ -34,12 +36,16 @@ def scrape_page(self, url, chamber): doc = lxml.html.fromstring(html) doc.make_links_absolute(url) - ctty_name = doc.xpath("//span[@class='heading']")[0].text_content() + ctty_name = doc.xpath('//*[@id="main-content"]/section[2]//h2')[ + 0 + ].text_content() # Remove prefixes from the name like "Hearing notice for" ctty_name = ctty_name_re.match(ctty_name).group(4) - tables = doc.xpath("//table[@cellpadding='3']") + tables = doc.xpath( + '//div[contains(@class, "card")][.//h4[contains(., "Hearing Details")]]//table' + ) if not tables: self.warning(f"Empty hearing data for {url}") return False, False @@ -47,12 +53,11 @@ def scrape_page(self, url, chamber): rows = info.xpath(".//tr") metainf = {} for row in rows: - tds = row.xpath(".//td") - key = tds[0].text_content().strip() - value = tds[1].text_content().strip() + tds = "".join(row.xpath(".//td//text()")).split(":") + key = tds[0].strip() + value = ":".join(tds[1:]).strip() metainf[key] = value - - where = metainf["Location:"] + where = metainf["Location"] description = f"{chamber} {ctty_name}" # Remove committee suffix from names @@ -64,12 +69,13 @@ def scrape_page(self, url, chamber): descr_parts = description.split("-") description = " - ".join([x.strip() for x in descr_parts]) - datetime = metainf["Scheduled Date:"] + datetime = metainf["Date"] datetime = re.sub(r"\s+", " ", datetime) repl = {"AM": " AM", "PM": " PM"} # Space shim. for r in repl: datetime = datetime.replace(r, repl[r]) - datetime = self.localize(dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")) + # datetime = self.localize(dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p")) + datetime = self.localize(dt.datetime.strptime(datetime, "%m/%d/%Y %I:%M %p")) event_name = f"{description}#{where}#{datetime}" event = Event(description, start_date=datetime, location_name=where) @@ -78,20 +84,24 @@ def scrape_page(self, url, chamber): event.add_participant(ctty_name, "organization") - bills = tables[1] - for bill in bills.xpath(".//tr")[1:]: - tds = bill.xpath(".//td") - if len(tds) < 4: - continue - # First, let's get the bill ID: - bill_id = tds[0].text_content() - - # Apply correct spacing to bill id - (alpha, num) = bill_re.match(bill_id).groups() - bill_id = f"{alpha} {num}" - - agenda_item = event.add_agenda_item(bill_id) - agenda_item.add_bill(bill_id) + bills = doc.xpath( + '//div[contains(@class, "card")][.//h4[contains(., "Bills Assigned To Hearing")]]//table' + ) + if bills: + bills = bills[0] + for bill in bills.xpath(".//tr")[1:]: + tds = bill.xpath(".//td") + if len(tds) < 4: + continue + # First, let's get the bill ID: + bill_id = tds[0].text_content() + + # Apply correct spacing to bill id + (alpha, num) = bill_re.match(bill_id).groups() + bill_id = f"{alpha} {num}" + + agenda_item = event.add_agenda_item(bill_id) + agenda_item.add_bill(bill_id) return event, event_name @@ -112,14 +122,15 @@ def scrape(self): no_scheduled_ct += 1 continue - tables = doc.xpath("//table[@width='550']") + tables = doc.xpath('//*[@id="pane-Week"]//table//tr') events = set() for table in tables: - meetings = table.xpath(".//a") + meetings = table.xpath(".//button") for meeting in meetings: - event, name = self.scrape_page( - meeting.attrib["href"], chamber_names[chamber] - ) + meeting_url = "https://beta.ilga.gov" + meeting.attrib[ + "onclick" + ].replace("location.href=", 
"").strip("'. ") + event, name = self.scrape_page(meeting_url, chamber_names[chamber]) if event and name: if name in events: self.warning(f"Duplicate event {name}") From 5279c5c7ee5138d0eb622f0beac3e3b4e7ec6496 Mon Sep 17 00:00:00 2001 From: braykuka Date: Mon, 9 Dec 2024 15:34:53 +0100 Subject: [PATCH 2/4] fix: lint issue --- scrapers/il/__init__.py | 2 -- scrapers/il/events.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/scrapers/il/__init__.py b/scrapers/il/__init__.py index 2d3aa32a76..0d5cd5f278 100644 --- a/scrapers/il/__init__.py +++ b/scrapers/il/__init__.py @@ -1,7 +1,5 @@ # encoding=utf-8 -from urllib import response import requests -from utils import url_xpath from openstates.scrape import State from .bills import IlBillScraper from .events import IlEventScraper diff --git a/scrapers/il/events.py b/scrapers/il/events.py index be949c2da3..22b36db838 100644 --- a/scrapers/il/events.py +++ b/scrapers/il/events.py @@ -1,6 +1,4 @@ -from calendar import month import datetime as dt -import json import lxml import re From e0190f88d567dcbe3668c54d9fb3912b94c0f947 Mon Sep 17 00:00:00 2001 From: braykuka Date: Tue, 10 Dec 2024 21:14:33 +0100 Subject: [PATCH 3/4] Fix small issue --- scrapers/il/bills.py | 14 ++++++++------ scrapers/il/events.py | 12 ++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/scrapers/il/bills.py b/scrapers/il/bills.py index 926dced675..c2168f500c 100644 --- a/scrapers/il/bills.py +++ b/scrapers/il/bills.py @@ -9,6 +9,7 @@ from openstates.utils import convert_pdf +BASE_URL = "https://beta.ilga.gov" central = pytz.timezone("US/Central") @@ -239,7 +240,7 @@ } DUPE_VOTES = { - "https://ilga.gov/legislation/votehistory/100/house/committeevotes/" + f"{BASE_URL}/legislation/votehistory/100/house/committeevotes/" "10000HB2457_16401.pdf" } @@ -279,12 +280,13 @@ def chamber_slug(chamber): class IlBillScraper(Scraper): - LEGISLATION_URL = "https://beta.ilga.gov/Legislation/" + LEGISLATION_URL = f"{BASE_URL}/Legislation/" localize = pytz.timezone("America/Chicago").localize def get_bill_urls(self, chamber, session, doc_type): params = session_details[session]["params"] - url = "https://beta.ilga.gov/Legislation/RegularSession/{}?SessionId={}".format( + url = "{}/Legislation/RegularSession/{}?SessionId={}".format( + BASE_URL, doc_type, params["SessionId"], ) @@ -293,7 +295,7 @@ def get_bill_urls(self, chamber, session, doc_type): doc.make_links_absolute(url) for bill_url in doc.xpath( - '//div[@id="div_0001"]//table//td[1]/a[contains(@href, "DocNum=")]/@href' + '//div[contains(@id,"div_")]//table//td[1]/a[contains(@href, "DocNum=")]/@href' ): yield bill_url @@ -326,7 +328,7 @@ def scrape(self, session=None): def scrape_archive_bills(self, session): session_abr = session[0:2] - url = f"https://beta.ilga.gov/documents/legislation/legisnet{session_abr}/{session_abr}gatoc.html" + url = f"{BASE_URL}/documents/legislation/legisnet{session_abr}/{session_abr}gatoc.html" html = self.get(url).text doc = lxml.html.fromstring(html) doc.make_links_absolute(url) @@ -616,7 +618,7 @@ def scrape_documents(self, bill, version_url): ) ) # if it's html, extract the pdf link too while we're here. 
-                pdf_url = f"https://beta.ilga.gov/documents/legislation/{session_number}/{doctype}/PDF/{version_id}.pdf"
+                pdf_url = f"{BASE_URL}/documents/legislation/{session_number}/{doctype}/PDF/{version_id}.pdf"
                 bill.add_version_link(name, pdf_url, media_type="application/pdf")

                 bill.add_version_link(name, url, media_type=mimetype)

diff --git a/scrapers/il/events.py b/scrapers/il/events.py
index 22b36db838..4820f30b06 100644
--- a/scrapers/il/events.py
+++ b/scrapers/il/events.py
@@ -7,9 +7,10 @@

 import pytz

+BASE_URL = "https://beta.ilga.gov"
 urls = {
-    "upper": "https://beta.ilga.gov/Senate/Schedules",
-    "lower": "https://beta.ilga.gov/House/Schedules",
+    "upper": f"{BASE_URL}/Senate/Schedules",
+    "lower": f"{BASE_URL}/House/Schedules",
 }

 chamber_names = {
@@ -72,7 +73,6 @@ def scrape_page(self, url, chamber):
         repl = {"AM": " AM", "PM": " PM"}  # Space shim.
         for r in repl:
             datetime = datetime.replace(r, repl[r])
-        # datetime = self.localize(dt.datetime.strptime(datetime, "%b %d, %Y %I:%M %p"))
         datetime = self.localize(dt.datetime.strptime(datetime, "%m/%d/%Y %I:%M %p"))

         event_name = f"{description}#{where}#{datetime}"
@@ -125,9 +125,9 @@ def scrape(self):
             events = set()
             for table in tables:
                 meetings = table.xpath(".//button")
                 for meeting in meetings:
-                    meeting_url = "https://beta.ilga.gov" + meeting.attrib[
-                        "onclick"
-                    ].replace("location.href=", "").strip("'. ")
+                    meeting_url = BASE_URL + meeting.attrib["onclick"].replace(
+                        "location.href=", ""
+                    ).strip("'. ")
                     event, name = self.scrape_page(meeting_url, chamber_names[chamber])
                     if event and name:
                         if name in events:

From 6437443edffadf062492146a51fbc80a80abaec0 Mon Sep 17 00:00:00 2001
From: braykuka
Date: Fri, 13 Dec 2024 16:49:48 +0100
Subject: [PATCH 4/4] fix: no abstract issue

---
 scrapers/il/bills.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scrapers/il/bills.py b/scrapers/il/bills.py
index c2168f500c..916c0c3fad 100644
--- a/scrapers/il/bills.py
+++ b/scrapers/il/bills.py
@@ -504,7 +504,8 @@ def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
             chamber=chamber,
         )

-        bill.add_abstract(summary, note="")
+        if summary:
+            bill.add_abstract(summary, note="")
         bill.add_source(url)

         # sponsors