diff --git a/scrapers/al/bills.py b/scrapers/al/bills.py
index 03346c8b1a..7360fafe72 100644
--- a/scrapers/al/bills.py
+++ b/scrapers/al/bills.py
@@ -6,6 +6,7 @@
 import dateutil
 import requests
 from openstates.scrape import Scraper, Bill, VoteEvent
+from openstates.exceptions import EmptyScrape
 from utils.media import get_media_type
 from .actions import Categorizer
@@ -22,6 +23,7 @@ class ALBillScraper(Scraper):
     session_type = ""
     bill_ids = set()
     vote_keys = set()
+    count = 0
 
     gql_headers = {
         "Accept": "*/*",
@@ -40,13 +42,16 @@ def scrape(self, session):
         for bill_type in ["B", "R"]:
             yield from self.scrape_bill_type(session, bill_type, 0, 50)
 
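+        # self.count is incremented once per bill yielded in scrape_bill_type,
+        # so a zero total after both bill types means the scrape came up empty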
+        if self.count == 0:
+            raise EmptyScrape
+
     def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int):
-        self.info(f"Scraping offset {offset} limit {limit}")
+        self.info(f"Scraping {bill_type} offset {offset} limit {limit}")
 
         json_data = {
             "query": "query bills($googleId: String, $category: String, $sessionYear: String, $sessionType: String, $direction: String, $orderBy: String, $offset: Int, $limit: Int, $filters: InstrumentOverviewInput! = {}, $search: String, $instrumentType: String) {\n allInstrumentOverviews(\n googleId: $googleId\n category: $category\n instrumentType: $instrumentType\n sessionYear: $sessionYear\n sessionType: $sessionType\n direction: $direction\n orderBy: $orderBy\n limit: $limit\n offset: $offset\n customFilters: $filters\n search: $search\n ) {\n ID\n SessionYear\n InstrumentNbr\n InstrumentSponsor\n SessionType\n Body\n Subject\n ShortTitle\n AssignedCommittee\n PrefiledDate\n FirstRead\n CurrentStatus\n LastAction\n LastActionDate\n ActSummary\n ViewEnacted\n CompanionInstrumentNbr\n EffectiveDateCertain\n EffectiveDateOther\n InstrumentType\n InstrumentUrl\n IntroducedUrl\n EngrossedUrl\n EnrolledUrl\n }\n allInstrumentOverviewsCount(\n googleId: $googleId\n category: $category\n instrumentType: $instrumentType\n sessionYear: $sessionYear\n sessionType: $sessionType\n customFilters: $filters\n search: $search\n )\n}",
             "variables": {
-                "sessionType": "2025 Regular Session",
+                "sessionType": self.session_year,
                 "instrumentType": bill_type,
                 "orderBy": "LastActionDate",
                 "direction": "DESC",
@@ -58,9 +63,8 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int):
         page = requests.post(self.gql_url, headers=self.gql_headers, json=json_data)
         page = json.loads(page.content)
 
-        if len(page["data"]["allInstrumentOverviews"]) < 1 and offset == 0:
-            # TODO: this fails if one chamber is empty and the other isn't
-            # raise EmptyScrape
+
+        if len(page["data"]["allInstrumentOverviews"]) < 1:
             return
 
         for row in page["data"]["allInstrumentOverviews"]:
@@ -115,14 +119,15 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int):
                 bill.add_subject(first_sub[0])
 
             if row["CompanionInstrumentNbr"] != "":
-                self.warning("AL Companion found. Code it up.")
-
-            # TODO: EffectiveDateCertain, EffectiveDateOther
+                bill.add_related_bill(
+                    row["CompanionInstrumentNbr"], session, "companion"
+                )
 
-            # TODO: Fiscal notes, BUDGET ISOLATION RESOLUTION
+            # TODO: BUDGET ISOLATION RESOLUTION
 
             bill.extras["AL_BILL_ID"] = row["ID"]
+            self.count += 1
 
             yield bill
 
         # no need to paginate again if we max the last page
@@ -149,14 +154,9 @@ def scrape_versions(self, bill, row):
                 media_type="application/pdf",
             )
 
-    # the search JSON contains the act reference, but not the date,
-    # which we need to build the action. It's on the act page at the SoS though.
-    def scrape_act(self, bill: Bill, link: str):
-        act_page = lxml.html.fromstring(link)
-        link = act_page.xpath("//a")[0]
-        url = link.xpath("@href")[0]
-        act_number = link.xpath("text()")[0].replace("View Act", "").strip()
-
+    # The search JSON contains the act reference but not the final act text,
+    # so scrape that from the Secretary of State's site
+    def scrape_act(self, bill: Bill, url: str, effective: str):
         try:
             page = self.get(url, timeout=120).content
         except requests.exceptions.ConnectTimeout:
@@ -166,24 +166,12 @@ def scrape_act(self, bill: Bill, link: str):
         page = lxml.html.fromstring(page)
         page.make_links_absolute(url)
 
-        if not page.xpath(
-            '//tr[td[contains(text(),"Approved Date and Time")]]/td[2]/text()'
-        ):
-            return
-
-        # second td in the row containing Approved Date and Time
-        act_date = page.xpath(
-            '//tr[td[contains(text(),"Approved Date and Time")]]/td[2]/text()'
-        )[0]
-        act_date = act_date.strip().replace(" ", "")
-        action_date = dateutil.parser.parse(act_date)
-        action_date = self.tz.localize(action_date)
-        bill.add_action(
-            chamber="executive",
-            description=f"Enacted as {act_number}",
-            date=action_date,
-            classification="became-law",
+        act_number = (
+            page.xpath("//td[contains(text(), 'ACT NUMBER')]/text()")[0]
+            .replace("ACT NUMBER", "")
+            .strip()
         )
+        act_number = act_number.replace(" ", "")
 
         if page.xpath("//a[input[@value='View Image']]"):
             act_text_url = page.xpath("//a[input[@value='View Image']]/@href")[0]
@@ -191,16 +179,28 @@ def scrape_act(self, bill: Bill, link: str):
                 f"Act {act_number}",
                 act_text_url,
                 media_type=get_media_type(act_text_url),
+                on_duplicate="ignore",
             )
 
         bill.extras["AL_ACT_NUMBER"] = act_number
 
-        bill.add_citation(
-            "Alabama Chapter Law", act_number, "chapter", url=act_text_url
-        )
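+        # EffectiveDateCertain arrives from the bill row as a plain date string;
+        # when present, parse it so the citation carries an effective date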
+        if effective:
+            date_effective = dateutil.parser.parse(effective).date()
+            bill.add_citation(
+                "Alabama Chapter Law",
+                act_number,
+                "chapter",
+                url=act_text_url,
+                effective=date_effective,
+            )
+        else:
+            bill.add_citation(
+                "Alabama Chapter Law", act_number, "chapter", url=act_text_url
+            )
 
     def scrape_actions(self, bill, bill_row):
         bill_id = bill.identifier.replace(" ", "")
+
         if bill_row["PrefiledDate"]:
             action_date = datetime.datetime.strptime(
                 bill_row["PrefiledDate"], "%m/%d/%Y"
@@ -235,7 +235,7 @@ def scrape_actions(self, bill, bill_row):
             if row["Committee"]:
                 action_text = f'{row["Matter"]} ({row["Committee"]})'
 
-            action_date = datetime.datetime.strptime(row["CalendarDate"], "%m-%d-%Y")
+            action_date = dateutil.parser.parse(row["CalendarDate"])
             action_date = self.tz.localize(action_date)
 
             action_attr = self.categorizer.categorize(row["Matter"])
@@ -252,31 +252,26 @@ def scrape_actions(self, bill, bill_row):
             )
 
             if row["AmdSubUrl"] != "":
-                page = lxml.html.fromstring(row["AmdSubUrl"])
-                link = page.xpath("//a")[0]
-                amd_url = link.xpath("@href")[0]
-                amd_name = link.xpath("text()")[0].strip()
-                amd_name = f"Amendment {amd_name}"
-                if row["Committee"] != "":
-                    amd_name = f"{row['Committee']} {amd_name}"
-
                 bill.add_version_link(
-                    amd_name,
-                    url=amd_url,
-                    media_type=get_media_type(amd_url),
+                    row["Matter"],
+                    url=row["AmdSubUrl"],
+                    media_type=get_media_type(row["AmdSubUrl"]),
+                    on_duplicate="ignore",
                 )
 
             if int(row["VoteNbr"]) > 0:
                 yield from self.scrape_vote(bill, row)
 
         if bill_row["ViewEnacted"]:
-            self.scrape_act(bill, bill_row["ViewEnacted"])
+            self.scrape_act(
+                bill, bill_row["ViewEnacted"], bill_row["EffectiveDateCertain"]
+            )
 
     def scrape_fiscal_notes(self, bill):
         bill_id = bill.identifier.replace(" ", "")
 
         json_data = {
-            "query": "query fiscalNotesBySessionYearInstrumentNbr($instrumentNbr: String, $sessionType: String, $sessionYear: String){fiscalNotesBySessionYearInstrumentNbr(instrumentNbr:$instrumentNbr, sessionType:$sessionType, sessionYear: $sessionYear, ){ FiscalNoteDescription,FiscalNoteUrl,SortOrder }}",
+            "query": "query fiscalNotes($instrumentNbr: String, $sessionType: String, $sessionYear: String){fiscalNotes(instrumentNbr:$instrumentNbr, sessionType:$sessionType, sessionYear: $sessionYear, ){ FiscalNoteDescription,FiscalNoteUrl,SortOrder }}",
             "variables": {
                 "instrumentNbr": bill_id,
                 "sessionType": self.session_type,
@@ -286,7 +281,7 @@ def scrape_fiscal_notes(self, bill):
         page = requests.post(self.gql_url, headers=self.gql_headers, json=json_data)
         page = json.loads(page.content)
 
-        for row in page["data"]["fiscalNotesBySessionYearInstrumentNbr"]:
+        for row in page["data"]["fiscalNotes"]:
             bill.add_document_link(
                 f"Fiscal Note: {row['FiscalNoteDescription']}",
                 row["FiscalNoteUrl"],
@@ -352,5 +347,5 @@ def scrape_vote(self, bill, action_row):
 
     # The api gives us dates as m-d-Y but needs them in Y-m-d
     def transform_date(self, date: str) -> str:
-        date = datetime.datetime.strptime(date, "%m-%d-%Y")
+        date = dateutil.parser.parse(date)
         return date.strftime("%Y-%m-%d")
diff --git a/scrapers/al/events.py b/scrapers/al/events.py
index b29b48d9cd..852cafbe9a 100644
--- a/scrapers/al/events.py
+++ b/scrapers/al/events.py
@@ -1,7 +1,6 @@
 import datetime
 import dateutil.parser
 import json
-import pytz
 
 from utils import LXMLMixin
 from utils.events import match_coordinates
@@ -11,9 +10,6 @@
 
 
 class ALEventScraper(Scraper, LXMLMixin):
-    _TZ = pytz.timezone("US/Eastern")
-    _DATETIME_FORMAT = "%m/%d/%Y %I:%M %p"
-
     def scrape(self, start=None):
         gql_url = "https://alison.legislature.state.al.us/graphql/"
 
@@ -31,57 +27,37 @@ def scrape(self, start=None):
             start = datetime.datetime.today().replace(day=1).strftime("%Y-%m-%d")
 
         query = (
-            '{hearingsMeetings(eventType:"meeting", body:"", keyword:"", toDate:"3000-12-31", '
-            f'fromDate:"{start}", sortTime:"", direction:"ASC", orderBy:"SortTime", )'
-            "{ EventDt,EventTm,Location,EventTitle,EventDesc,Body,DeadlineDt,PublicHearing,"
-            "Committee,AgendaUrl,SortTime,OidMeeting,LiveStream }}"
+            'query meetings($body: OrganizationBody, $managedInLinx: Boolean, $autoScroll: Boolean!) {\n meetings(\n where: {body: {eq: $body}, startDate: {gte: "'
+            + start
+            + '"}, managedInLinx: {eq: $managedInLinx}}\n ) {\n data {\n id\n startDate\n startTime\n location\n title\n description\n body\n hasPublicHearing\n hasLiveStream\n committee\n agendaUrl\n agendaItems @skip(if: $autoScroll) {\n id\n sessionType\n sessionYear\n instrumentNumber\n shortTitle\n matter\n recommendation\n hasPublicHearing\n sponsor\n __typename\n }\n __typename\n }\n count\n __typename\n }\n}'
        )
 
         json_data = {
             "query": query,
-            "operationName": "",
-            "variables": [],
+            "operationName": "meetings",
+            "variables": {
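+                # $autoScroll is declared Boolean! and gates agendaItems via
+                # @skip, so it must always be sent; False keeps agenda items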
+                "autoScroll": False,
+            },
         }
 
         page = self.post(gql_url, headers=headers, json=json_data)
         page = json.loads(page.content)
 
-        if len(page["data"]["hearingsMeetings"]) == 0:
+        if len(page["data"]["meetings"]["data"]) == 0:
             raise EmptyScrape
 
-        query = (
-            '{hearingsMeetingsDetails(eventType:"meeting", body:"", keyword:"", toDate:"3000-12-31", '
-            f'fromDate:"{start}", sortTime:"", direction:"ASC", orderBy:"SortTime", )'
-            "{EventDt,EventTm,Location,EventTitle,EventDesc,Body,DeadlineDt,PublicHearing,"
-            "LiveStream,Committee,AgendaUrl,SortTime,OidMeeting, Sponsor, InstrumentNbr, ShortTitle, "
-            "OidInstrument, SessionType, SessionYear}}"
-        )
-        json_data = {
-            "query": query,
-            "operationName": "",
-            "variables": [],
-        }
-        details = self.post(gql_url, headers=headers, json=json_data)
-        details = json.loads(details.content)
-
-        bills = {}
-        for row in details["data"]["hearingsMeetingsDetails"]:
-            if row["OidMeeting"] not in bills:
-                bills[row["OidMeeting"]] = []
-            bills[row["OidMeeting"]].append(row["InstrumentNbr"])
-
         event_keys = set()
 
-        for row in page["data"]["hearingsMeetings"]:
-            event_date = self._TZ.localize(dateutil.parser.parse(row["SortTime"]))
-            event_title = row["EventTitle"]
-            event_location = row["Location"]
+        for row in page["data"]["meetings"]["data"]:
+            event_date = dateutil.parser.parse(row["startDate"])
+            event_title = row["title"]
+            event_location = row["location"]
 
             if event_location.startswith("Room"):
                 event_location = (
                     f"11 South Union St, Montgomery, AL 36130. {event_location}"
                 )
-            event_desc = row["EventDesc"]
+            event_desc = row["description"] or ""
 
             event_key = f"{event_title}#{event_location}#{event_date}"
@@ -104,28 +80,35 @@ def scrape(self, start=None):
             )
 
             event.dedupe_key = event_key
 
-            # TODO: When they add committees, agendas, and video streams
-
             match_coordinates(
                 event, {"11 south union": (32.37707594063977, -86.29919861850152)}
             )
 
-            for bill in bills.get(row["OidMeeting"], []):
-                event.add_bill(bill)
+            for agenda in row["agendaItems"]:
+                event.add_bill(agenda["instrumentNumber"])
 
-            if row["AgendaUrl"]:
-                mime = get_media_type(row["AgendaUrl"], default="text/html")
+            if row["agendaUrl"]:
+                mime = get_media_type(row["agendaUrl"], default="text/html")
                 event.add_document(
-                    "Agenda", row["AgendaUrl"], media_type=mime, on_duplicate="ignore"
+                    "Agenda", row["agendaUrl"], media_type=mime, on_duplicate="ignore"
                 )
 
-            com = row["Committee"]
+            com = row["committee"]
             if com:
-                com = f"{row['Body']} {com}"
-                com = com.replace("- House", "").replace("- Senate", "")
+                com = f"{row['body']} {com}"
+                com = (
+                    com.replace("- House", "")
+                    .replace("- Senate", "")
+                    .replace("(House)", "")
+                    .replace("(Senate)", "")
+                )
                 event.add_committee(com)
 
-            # TODO, looks like we can generate a source link from the room and OID,
-            # does this stick after the event has ended?
+            # TODO: these break after the event passes. Is there any permalink?
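+            # the live-stream page seems to expect the meeting id wrapped in
+            # URL-encoded quotes (%22), per the sample link below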
+            if row["hasLiveStream"]:
+                # https://alison.legislature.state.al.us/live-stream?location=Room+200&meeting=%2223735%22
+                event_url = f"https://alison.legislature.state.al.us/live-stream?location={row['location']}&meeting=%22{row['id']}%22"
+                event.add_source(event_url)
+
             event.add_source("https://alison.legislature.state.al.us/todays-schedule")
 
             yield event