AL: fixes for new site (#5050)
* AL: updates for new site
showerst authored Oct 10, 2024
1 parent 3ce9b5a commit fa24e15
Showing 2 changed files with 80 additions and 102 deletions.
101 changes: 48 additions & 53 deletions scrapers/al/bills.py
@@ -6,6 +6,7 @@
import dateutil
import requests
from openstates.scrape import Scraper, Bill, VoteEvent
from openstates.exceptions import EmptyScrape
from utils.media import get_media_type
from .actions import Categorizer

@@ -22,6 +23,7 @@ class ALBillScraper(Scraper):
session_type = ""
bill_ids = set()
vote_keys = set()
count = 0

gql_headers = {
"Accept": "*/*",
@@ -40,13 +42,16 @@ def scrape(self, session):
for bill_type in ["B", "R"]:
yield from self.scrape_bill_type(session, bill_type, 0, 50)

if self.count == 0:
raise EmptyScrape

def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int):
self.info(f"Scraping offset {offset} limit {limit}")
self.info(f"Scraping {bill_type} offset {offset} limit {limit}")

json_data = {
"query": "query bills($googleId: String, $category: String, $sessionYear: String, $sessionType: String, $direction: String, $orderBy: String, $offset: Int, $limit: Int, $filters: InstrumentOverviewInput! = {}, $search: String, $instrumentType: String) {\n allInstrumentOverviews(\n googleId: $googleId\n category: $category\n instrumentType: $instrumentType\n sessionYear: $sessionYear\n sessionType: $sessionType\n direction: $direction\n orderBy: $orderBy\n limit: $limit\n offset: $offset\n customFilters: $filters\n search: $search\n ) {\n ID\n SessionYear\n InstrumentNbr\n InstrumentSponsor\n SessionType\n Body\n Subject\n ShortTitle\n AssignedCommittee\n PrefiledDate\n FirstRead\n CurrentStatus\n LastAction\n LastActionDate\n ActSummary\n ViewEnacted\n CompanionInstrumentNbr\n EffectiveDateCertain\n EffectiveDateOther\n InstrumentType\n InstrumentUrl\n IntroducedUrl\n EngrossedUrl\n EnrolledUrl\n }\n allInstrumentOverviewsCount(\n googleId: $googleId\n category: $category\n instrumentType: $instrumentType\n sessionYear: $sessionYear\n sessionType: $sessionType\n customFilters: $filters\n search: $search\n )\n}",
"variables": {
"sessionType": "2025 Regular Session",
"sessionType": self.session_year,
"instrumentType": bill_type,
"orderBy": "LastActionDate",
"direction": "DESC",
@@ -58,9 +63,8 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int)

page = requests.post(self.gql_url, headers=self.gql_headers, json=json_data)
page = json.loads(page.content)
if len(page["data"]["allInstrumentOverviews"]) < 1 and offset == 0:
# TODO: this fails if one chamber is empty and the other isn't
# raise EmptyScrape

if len(page["data"]["allInstrumentOverviews"]) < 1:
return

for row in page["data"]["allInstrumentOverviews"]:
@@ -115,14 +119,15 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int)
bill.add_subject(first_sub[0])

if row["CompanionInstrumentNbr"] != "":
self.warning("AL Companion found. Code it up.")

# TODO: EffectiveDateCertain, EffectiveDateOther
bill.add_related_bill(
row["CompanionInstrumentNbr"], session, "companion"
)

# TODO: Fiscal notes, BUDGET ISOLATION RESOLUTION
# TODO: BUDGET ISOLATION RESOLUTION

bill.extras["AL_BILL_ID"] = row["ID"]

self.count += 1
yield bill

# no need to paginate again if we max the last page
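A minimal, self-contained sketch of the offset/limit pagination the comment above refers to. fetch_page and the sample rows are hypothetical stand-ins for the GraphQL POST and the bill-building loop in scrape_bill_type; only the offset/limit/short-page logic mirrors the scraper:

from typing import List

# Fake API rows standing in for the GraphQL response (hypothetical data).
SAMPLE_ROWS: List[dict] = [{"InstrumentNbr": f"HB{n}"} for n in range(1, 121)]

def fetch_page(offset: int, limit: int) -> List[dict]:
    # Stand-in for requests.post(...) + json.loads(...) in scrape_bill_type.
    return SAMPLE_ROWS[offset : offset + limit]

def scrape_all(limit: int = 50) -> None:
    offset = 0
    while True:
        rows = fetch_page(offset, limit)
        for row in rows:
            print(row["InstrumentNbr"])  # stand-in for yielding a Bill
        if len(rows) < limit:
            break  # short (or empty) page: no need to paginate again
        offset += limit

scrape_all()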
@@ -149,14 +154,9 @@ def scrape_versions(self, bill, row):
media_type="application/pdf",
)

# the search JSON contains the act reference, but not the date,
# which we need to build the action. It's on the act page at the SoS though.
def scrape_act(self, bill: Bill, link: str):
act_page = lxml.html.fromstring(link)
link = act_page.xpath("//a")[0]
url = link.xpath("@href")[0]
act_number = link.xpath("text()")[0].replace("View Act", "").strip()

# the search JSON contains the act reference, but not the final text
# so scrape it from the SoS
def scrape_act(self, bill: Bill, url: str, effective: str):
try:
page = self.get(url, timeout=120).content
except requests.exceptions.ConnectTimeout:
@@ -166,41 +166,41 @@ def scrape_act(self, bill: Bill, link: str):
page = lxml.html.fromstring(page)
page.make_links_absolute(url)

if not page.xpath(
'//tr[td[contains(text(),"Approved Date and Time")]]/td[2]/text()'
):
return

# second td in the row containing Approved Date and Time
act_date = page.xpath(
'//tr[td[contains(text(),"Approved Date and Time")]]/td[2]/text()'
)[0]
act_date = act_date.strip().replace("&nbsp;", "")
action_date = dateutil.parser.parse(act_date)
action_date = self.tz.localize(action_date)
bill.add_action(
chamber="executive",
description=f"Enacted as {act_number}",
date=action_date,
classification="became-law",
act_number = (
page.xpath("//td[contains(text(), 'ACT NUMBER')]/text()")[0]
.replace("ACT NUMBER", "")
.strip()
)
act_number = act_number.replace(" ", "")

if page.xpath("//a[input[@value='View Image']]"):
act_text_url = page.xpath("//a[input[@value='View Image']]/@href")[0]
bill.add_version_link(
f"Act {act_number}",
act_text_url,
media_type=get_media_type(act_text_url),
on_duplicate="ignore",
)

bill.extras["AL_ACT_NUMBER"] = act_number

bill.add_citation(
"Alabama Chapter Law", act_number, "chapter", url=act_text_url
)
if effective:
date_effective = dateutil.parser.parse(effective).date()
bill.add_citation(
"Alabama Chapter Law",
act_number,
"chapter",
url=act_text_url,
effective=date_effective,
)
else:
bill.add_citation(
"Alabama Chapter Law", act_number, "chapter", url=act_text_url
)

def scrape_actions(self, bill, bill_row):
bill_id = bill.identifier.replace(" ", "")

if bill_row["PrefiledDate"]:
action_date = datetime.datetime.strptime(
bill_row["PrefiledDate"], "%m/%d/%Y"
@@ -235,7 +235,7 @@ def scrape_actions(self, bill, bill_row):
if row["Committee"]:
action_text = f'{row["Matter"]} ({row["Committee"]})'

action_date = datetime.datetime.strptime(row["CalendarDate"], "%m-%d-%Y")
action_date = dateutil.parser.parse(row["CalendarDate"])
action_date = self.tz.localize(action_date)

action_attr = self.categorizer.categorize(row["Matter"])
@@ -252,31 +252,26 @@ def scrape_actions(self, bill, bill_row):
)

if row["AmdSubUrl"] != "":
page = lxml.html.fromstring(row["AmdSubUrl"])
link = page.xpath("//a")[0]
amd_url = link.xpath("@href")[0]
amd_name = link.xpath("text()")[0].strip()
amd_name = f"Amendment {amd_name}"
if row["Committee"] != "":
amd_name = f"{row['Committee']} {amd_name}"

bill.add_version_link(
amd_name,
url=amd_url,
media_type=get_media_type(amd_url),
row["Matter"],
url=row["AmdSubUrl"],
media_type=get_media_type(row["AmdSubUrl"]),
on_duplicate="ignore",
)

if int(row["VoteNbr"]) > 0:
yield from self.scrape_vote(bill, row)

if bill_row["ViewEnacted"]:
self.scrape_act(bill, bill_row["ViewEnacted"])
self.scrape_act(
bill, bill_row["ViewEnacted"], bill_row["EffectiveDateCertain"]
)

def scrape_fiscal_notes(self, bill):
bill_id = bill.identifier.replace(" ", "")

json_data = {
"query": "query fiscalNotesBySessionYearInstrumentNbr($instrumentNbr: String, $sessionType: String, $sessionYear: String){fiscalNotesBySessionYearInstrumentNbr(instrumentNbr:$instrumentNbr, sessionType:$sessionType, sessionYear: $sessionYear, ){ FiscalNoteDescription,FiscalNoteUrl,SortOrder }}",
"query": "query fiscalNotes($instrumentNbr: String, $sessionType: String, $sessionYear: String){fiscalNotes(instrumentNbr:$instrumentNbr, sessionType:$sessionType, sessionYear: $sessionYear, ){ FiscalNoteDescription,FiscalNoteUrl,SortOrder }}",
"variables": {
"instrumentNbr": bill_id,
"sessionType": self.session_type,
@@ -286,7 +281,7 @@ def scrape_fiscal_notes(self, bill):

page = requests.post(self.gql_url, headers=self.gql_headers, json=json_data)
page = json.loads(page.content)
for row in page["data"]["fiscalNotesBySessionYearInstrumentNbr"]:
for row in page["data"]["fiscalNotes"]:
bill.add_document_link(
f"Fiscal Note: {row['FiscalNoteDescription']}",
row["FiscalNoteUrl"],
@@ -352,5 +347,5 @@ def scrape_vote(self, bill, action_row):

# The api gives us dates as m-d-Y but needs them in Y-m-d
def transform_date(self, date: str) -> str:
date = datetime.datetime.strptime(date, "%m-%d-%Y")
date = dateutil.parser.parse(date)
return date.strftime("%Y-%m-%d")
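
As the comment above notes, the API emits m-d-Y dates while downstream code needs Y-m-d. This commit also swaps several strptime calls for dateutil.parser.parse, which tolerates both the "%m-%d-%Y" and "%m/%d/%Y" shapes seen in this file. A quick sketch of the equivalence (the sample date is illustrative):

import datetime
import dateutil.parser

raw = "10-10-2024"  # m-d-Y, as the API sends it
assert dateutil.parser.parse(raw) == datetime.datetime.strptime(raw, "%m-%d-%Y")
print(dateutil.parser.parse(raw).strftime("%Y-%m-%d"))  # -> 2024-10-10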
81 changes: 32 additions & 49 deletions scrapers/al/events.py
@@ -1,7 +1,6 @@
import datetime
import dateutil.parser
import json
import pytz

from utils import LXMLMixin
from utils.events import match_coordinates
@@ -11,9 +10,6 @@


class ALEventScraper(Scraper, LXMLMixin):
_TZ = pytz.timezone("US/Eastern")
_DATETIME_FORMAT = "%m/%d/%Y %I:%M %p"

def scrape(self, start=None):
gql_url = "https://alison.legislature.state.al.us/graphql/"

@@ -31,57 +27,37 @@ def scrape(self, start=None):
start = datetime.datetime.today().replace(day=1).strftime("%Y-%m-%d")

query = (
'{hearingsMeetings(eventType:"meeting", body:"", keyword:"", toDate:"3000-12-31", '
f'fromDate:"{start}", sortTime:"", direction:"ASC", orderBy:"SortTime", )'
"{ EventDt,EventTm,Location,EventTitle,EventDesc,Body,DeadlineDt,PublicHearing,"
"Committee,AgendaUrl,SortTime,OidMeeting,LiveStream }}"
'query meetings($body: OrganizationBody, $managedInLinx: Boolean, $autoScroll: Boolean!) {\n meetings(\n where: {body: {eq: $body}, startDate: {gte: "'
+ start
+ '"}, managedInLinx: {eq: $managedInLinx}}\n ) {\n data {\n id\n startDate\n startTime\n location\n title\n description\n body\n hasPublicHearing\n hasLiveStream\n committee\n agendaUrl\n agendaItems @skip(if: $autoScroll) {\n id\n sessionType\n sessionYear\n instrumentNumber\n shortTitle\n matter\n recommendation\n hasPublicHearing\n sponsor\n __typename\n }\n __typename\n }\n count\n __typename\n }\n}'
)

json_data = {
"query": query,
"operationName": "",
"variables": [],
"operationName": "meetings",
"variables": {
"autoScroll": False,
},
}

page = self.post(gql_url, headers=headers, json=json_data)
page = json.loads(page.content)

if len(page["data"]["hearingsMeetings"]) == 0:
if len(page["data"]["meetings"]["data"]) == 0:
raise EmptyScrape

query = (
'{hearingsMeetingsDetails(eventType:"meeting", body:"", keyword:"", toDate:"3000-12-31", '
f'fromDate:"{start}", sortTime:"", direction:"ASC", orderBy:"SortTime", )'
"{EventDt,EventTm,Location,EventTitle,EventDesc,Body,DeadlineDt,PublicHearing,"
"LiveStream,Committee,AgendaUrl,SortTime,OidMeeting, Sponsor, InstrumentNbr, ShortTitle, "
"OidInstrument, SessionType, SessionYear}}"
)
json_data = {
"query": query,
"operationName": "",
"variables": [],
}
details = self.post(gql_url, headers=headers, json=json_data)
details = json.loads(details.content)

bills = {}
for row in details["data"]["hearingsMeetingsDetails"]:
if row["OidMeeting"] not in bills:
bills[row["OidMeeting"]] = []
bills[row["OidMeeting"]].append(row["InstrumentNbr"])

event_keys = set()

for row in page["data"]["hearingsMeetings"]:
event_date = self._TZ.localize(dateutil.parser.parse(row["SortTime"]))
event_title = row["EventTitle"]
event_location = row["Location"]
for row in page["data"]["meetings"]["data"]:
event_date = dateutil.parser.parse(row["startDate"])
event_title = row["title"]
event_location = row["location"]

if event_location.startswith("Room"):
event_location = (
f"11 South Union St, Montgomery, AL 36130. {event_location}"
)
event_desc = row["EventDesc"]
event_desc = row["description"] or ""

event_key = f"{event_title}#{event_location}#{event_date}"

@@ -104,28 +80,35 @@ def scrape(self, start=None):
)
event.dedupe_key = event_key

# TODO: When they add committees, agendas, and video streams

match_coordinates(
event, {"11 south union": (32.37707594063977, -86.29919861850152)}
)

for bill in bills.get(row["OidMeeting"], []):
event.add_bill(bill)
for agenda in row["agendaItems"]:
event.add_bill(agenda["instrumentNumber"])

if row["AgendaUrl"]:
mime = get_media_type(row["AgendaUrl"], default="text/html")
if row["agendaUrl"]:
mime = get_media_type(row["agendaUrl"], default="text/html")
event.add_document(
"Agenda", row["AgendaUrl"], media_type=mime, on_duplicate="ignore"
"Agenda", row["agendaUrl"], media_type=mime, on_duplicate="ignore"
)

com = row["Committee"]
com = row["committee"]
if com:
com = f"{row['Body']} {com}"
com = com.replace("- House", "").replace("- Senate", "")
com = f"{row['body']} {com}"
com = (
com.replace("- House", "")
.replace("- Senate", "")
.replace("(House)", "")
.replace("(Senate)", "")
)
event.add_committee(com)

# TODO, looks like we can generate a source link from the room and OID,
# does this stick after the event has ended?
# TODO: these break after the event passes. Is there any permalink?
if row["hasLiveStream"]:
# https://alison.legislature.state.al.us/live-stream?location=Room+200&meeting=%2223735%22
event_url = f"https://alison.legislature.state.al.us/live-stream?location={row['location']}&meeting=%22{row['id']}%22"
event.add_source(event_url)

event.add_source("https://alison.legislature.state.al.us/todays-schedule")
yield event
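
The live-stream link above interpolates the raw location and meeting id into the URL. A hedged sketch of the same URL built with explicit encoding, assuming the site expects standard quote-plus escaping as in the Room+200 example in the comment (the function name and sample values are illustrative):

from urllib.parse import urlencode

def live_stream_url(location: str, meeting_id: str) -> str:
    # urlencode escapes the space as "+" and the quotes as %22, matching the
    # sample URL in the comment above.
    qs = urlencode({"location": location, "meeting": f'"{meeting_id}"'})
    return f"https://alison.legislature.state.al.us/live-stream?{qs}"

print(live_stream_url("Room 200", "23735"))
# -> https://alison.legislature.state.al.us/live-stream?location=Room+200&meeting=%2223735%22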
