Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

WIP: Alabama gql updates #5126

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
266 changes: 236 additions & 30 deletions scrapers/al/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
import json
import lxml
import re
import datetime
import dateutil
import requests
from openstates.scrape import Scraper, Bill, VoteEvent
from openstates.exceptions import EmptyScrape
from utils.media import get_media_type
from .actions import Categorizer
import pprint


class ALBillScraper(Scraper):
Expand Down Expand Up @@ -49,9 +49,65 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int
self.info(f"Scraping {bill_type} offset {offset} limit {limit}")

json_data = {
"query": "query bills($googleId: String, $category: String, $sessionYear: String, $sessionType: String, $direction: String, $orderBy: String, $offset: Int, $limit: Int, $filters: InstrumentOverviewInput! = {}, $search: String, $instrumentType: String) {\n allInstrumentOverviews(\n googleId: $googleId\n category: $category\n instrumentType: $instrumentType\n sessionYear: $sessionYear\n sessionType: $sessionType\n direction: $direction\n orderBy: $orderBy\n limit: $limit\n offset: $offset\n customFilters: $filters\n search: $search\n ) {\n ID\n SessionYear\n InstrumentNbr\n InstrumentSponsor\n SessionType\n Body\n Subject\n ShortTitle\n AssignedCommittee\n PrefiledDate\n FirstRead\n CurrentStatus\n LastAction\n LastActionDate\n ActSummary\n ViewEnacted\n CompanionInstrumentNbr\n EffectiveDateCertain\n EffectiveDateOther\n InstrumentType\n InstrumentUrl\n IntroducedUrl\n EngrossedUrl\n EnrolledUrl\n }\n allInstrumentOverviewsCount(\n googleId: $googleId\n category: $category\n instrumentType: $instrumentType\n sessionYear: $sessionYear\n sessionType: $sessionType\n customFilters: $filters\n search: $search\n )\n}",
"query": """query bills($googleId: ID, $category: String, $instrumentType: InstrumentType, $sessionYear: Int, $sessionType: String, $order: Order = [
"sessionYear",
"DESC"], $offset: Int, $limit: Int, $where: InstrumentOverviewWhere! = {}, $search: String) {
instrumentOverviews(
googleId: $googleId
category: $category
where: [{instrumentType: {eq: $instrumentType}, sessionYear: {eq: $sessionYear}, sessionType: {eq: $sessionType}}, $where]
order: $order
limit: $limit
offset: $offset
search: $search
) {
data {
...billModalDataFragment
id
sessionYear
instrumentNbr
instrumentSponsor
sessionType
body
subject
shortTitle
assignedCommittee
prefiledDate
firstRead
currentStatus
lastAction
actSummary
viewEnacted
companionInstrumentNbr
effectiveDateCertain
effectiveDateOther
instrumentType
__typename
}
count
__typename
}
}
fragment billModalDataFragment on InstrumentOverview {
id
instrumentNbr
instrumentType
instrumentSponsor
instrumentUrl
introducedUrl
engrossedUrl
enrolledUrl
companionInstrumentNbr
sessionType
sessionYear
instrumentNbr
actSummary
effectiveDateCertain
effectiveDateOther
__typename
}""",
"variables": {
"sessionType": self.session_year,
"sessionType": "2025 Regular Session",
"instrumentType": bill_type,
"orderBy": "LastActionDate",
"direction": "DESC",
Expand All @@ -61,22 +117,29 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int
},
}

# print(json_data)
page = requests.post(self.gql_url, headers=self.gql_headers, json=json_data)
page = json.loads(page.content)

if len(page["data"]["allInstrumentOverviews"]) < 1:
# print(page)

if page["data"]["instrumentOverviews"]["count"] < 1:
return

for row in page["data"]["allInstrumentOverviews"]:
chamber = self.chamber_map[row["Body"]]
title = row["ShortTitle"].strip()
for row in page["data"]["instrumentOverviews"]["data"]:

self.scrape_rest(None, row)
assert False

chamber = self.chamber_map[row["body"]]
title = row["shortTitle"].strip()

# some recently filed bills have no title, but a good subject which is close
if title == "":
title = row["Subject"]

# prevent duplicates
bill_id = row["InstrumentNbr"]
bill_id = row["instrumentNbr"]
if bill_id in self.bill_ids:
continue
else:
Expand All @@ -87,9 +150,9 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int
legislative_session=session,
title=title,
chamber=chamber,
classification=self.bill_types[row["InstrumentType"]],
classification=self.bill_types[row["instrumentType"]],
)
sponsor = row["InstrumentSponsor"]
sponsor = row["instrumentSponsor"]
if sponsor == "":
self.warning("No sponsors")
continue
Expand All @@ -106,8 +169,8 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int
yield from self.scrape_actions(bill, row)

bill.add_source("https://alison.legislature.state.al.us/bill-search")
if row["InstrumentUrl"]:
bill.add_source(row["InstrumentUrl"])
if row["instrumentUrl"]:
bill.add_source(row["instrumentUrl"])

# some subjects are super long & more like abstracts, but it looks like whatever is before a comma or
# semicolon is a clear enough subject. Adds the full given Subject as an Abstract & splits to add that
Expand All @@ -118,9 +181,9 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int
first_sub = re.split(",|;", full_subject)
bill.add_subject(first_sub[0])

if row["CompanionInstrumentNbr"] != "":
if row["companionInstrumentNbr"] != "":
bill.add_related_bill(
row["CompanionInstrumentNbr"], session, "companion"
row["companionInstrumentNbr"], session, "companion"
)

# TODO: BUDGET ISOLATION RESOLUTION
Expand All @@ -134,23 +197,149 @@ def scrape_bill_type(self, session: str, bill_type: str, offset: int, limit: int
if page["data"]["allInstrumentOverviewsCount"] > offset:
yield from self.scrape_bill_type(session, bill_type, offset + 50, limit)

def scrape_rest(self, bill, row):
    """Fetch the bill-detail ("bill modal") GraphQL payload for one bill.

    Queries the instrument overview plus its fiscal notes, action
    histories, and budget isolation resolutions (BIRs) in a single
    request, and returns the parsed response for the caller to consume.

    :param bill: Bill object this data belongs to. NOTE(review): not yet
        used — kept for the planned wiring of the detail data onto the
        bill; confirm before removing.
    :param row: one ``instrumentOverviews.data`` row from the list query;
        must contain ``instrumentNbr``, ``sessionType`` and ``sessionYear``.
    :return: the decoded JSON response (``dict``).
    """
    json_data = {
        "query": """query billModal(
          $sessionType: String
          $sessionYear: Int
          $instrumentNbr: String
          $instrumentType: InstrumentType
        ) {
          instrument: instrumentOverview(
            where: {
              sessionType: { eq: $sessionType }
              sessionYear: { eq: $sessionYear }
              instrumentNbr: { eq: $instrumentNbr }
              instrumentType: { eq: $instrumentType }
            }
          ) {
            id
            instrumentNbr
            sessionType
            currentStatus
            shortTitle
            introducedUrl
            engrossedUrl
            enrolledUrl
            viewEnacted
            actNbr
            __typename
          }
          fiscalNotes(
            where: {
              sessionType: { eq: $sessionType }
              sessionYear: { eq: $sessionYear }
              instrumentNbr: { eq: $instrumentNbr }
            }
          ) {
            data {
              description
              url
              sortOrder
              __typename
            }
            __typename
          }
          histories: instrumentHistories(
            where: {
              sessionType: { eq: $sessionType }
              sessionYear: { eq: $sessionYear }
              instrumentNbr: { eq: $instrumentNbr }
            }
          ) {
            data {
              instrumentNbr
              sessionYear
              sessionType
              calendarDate
              body
              matter
              amdSubUrl
              committee
              nays
              yeas
              vote
              voteNbr
              amdSub
              ...rollVoteModalInstrumentHistoryFragment
              __typename
            }
            __typename
          }
          birs(
            where: {
              sessionType: { eq: $sessionType }
              instrumentNbr: { eq: $instrumentNbr }
            }
          ) {
            data {
              instrumentNbr
              sessionYear
              sessionType
              bir
              calendarDate
              matter
              roll
              ...rollVoteModalBirFragment
              __typename
            }
            __typename
          }
          __typename
        }
        fragment rollVoteModalInstrumentHistoryFragment on InstrumentHistory {
          __typename
          instrumentNbr
          sessionType
          calendarDate
          body
          voteNbr
        }
        fragment rollVoteModalBirFragment on BudgetIsolationResolution {
          __typename
          instrumentNbr
          bir
          calendarDate
          roll
        }
        """,
        # Only keys matching declared operation variables are sent;
        # the previous "__typename"/"id" entries were not variables.
        "variables": {
            "instrumentNbr": row["instrumentNbr"],
            "sessionType": row["sessionType"],
            "sessionYear": row["sessionYear"],
        },
    }

    page = self.post(self.gql_url, headers=self.gql_headers, json=json_data)
    return json.loads(page.content)

def scrape_versions(self, bill, row):
    """Attach Introduced/Engrossed/Enrolled version links to *bill*.

    The captured span interleaved the old (``IntroducedUrl``) and new
    (``introducedUrl``) diff lines, which produced duplicate keyword
    arguments (a SyntaxError); this is the deduplicated new-key version.

    :param bill: Bill object to attach version links to.
    :param row: GraphQL overview row; each ``*Url`` value may be empty,
        in which case that version is skipped.
    """
    # (display name, row key) for each version stage, in lifecycle order.
    for label, key in (
        ("Introduced", "introducedUrl"),
        ("Engrossed", "engrossedUrl"),
        ("Enrolled", "enrolledUrl"),
    ):
        if row[key]:
            bill.add_version_link(
                label,
                url=row[key],
                media_type="application/pdf",
            )

Expand Down Expand Up @@ -201,13 +390,11 @@ def scrape_act(self, bill: Bill, url: str, effective: str):
def scrape_actions(self, bill, bill_row):
bill_id = bill.identifier.replace(" ", "")

if bill_row["PrefiledDate"]:
action_date = datetime.datetime.strptime(
bill_row["PrefiledDate"], "%m/%d/%Y"
)
if bill_row["prefiledDate"]:
action_date = dateutil.parser.parse(bill_row["prefiledDate"])
action_date = self.tz.localize(action_date)
bill.add_action(
chamber=self.chamber_map[bill_row["Body"]],
chamber=self.chamber_map[bill_row["body"]],
description="Filed",
date=action_date,
classification="filing",
Expand Down Expand Up @@ -270,21 +457,40 @@ def scrape_actions(self, bill, bill_row):
def scrape_fiscal_notes(self, bill):
    """Fetch fiscal notes for *bill* via GraphQL and attach them as documents.

    The captured WIP span hardcoded ``"2024 Regular Session"``/``"HB1"``
    in the query body while ignoring its own variables map; this version
    declares proper operation variables so every bill/session is queried.

    :param bill: Bill object; its identifier (spaces stripped) is the
        ``instrumentNbr`` filter.
    """
    bill_id = bill.identifier.replace(" ", "")

    json_data = {
        "query": """query fiscalNotes($instrumentNbr: String, $sessionType: String, $sessionYear: Int) {
          fiscalNotes(
            where: {
              sessionType: { eq: $sessionType }
              sessionYear: { eq: $sessionYear }
              instrumentNbr: { eq: $instrumentNbr }
            }
          ) {
            data {
              description
              url
              sortOrder
              __typename
            }
          }
        }
        """,
        "variables": {
            "instrumentNbr": bill_id,
            "sessionType": self.session_type,
            # NOTE(review): the schema declares sessionYear as Int, but
            # self.session_year's type is not visible here — confirm it
            # coerces cleanly (the WIP code appended " Regular Session"
            # to it, which contradicts the Int declaration).
            "sessionYear": int(self.session_year),
        },
    }

    page = requests.post(self.gql_url, headers=self.gql_headers, json=json_data)
    page = json.loads(page.content)
    for row in page["data"]["fiscalNotes"]["data"]:
        bill.add_document_link(
            f"Fiscal Note: {row['description']}",
            row["url"],
            media_type="application/pdf",
            on_duplicate="ignore",
        )
Expand Down
Loading