Delaware session updates (#5005)
* DE: Use continuous session, set request headers

* DE: remove print statements
showerst authored Aug 13, 2024
1 parent 7a01f20 commit 694d9f1
Showing 2 changed files with 57 additions and 12 deletions.
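Context for the diffs below: legis.delaware.gov is an ASP.NET site (the new init_asp helper says as much), and this commit moves the Delaware scraper from one-off requests onto a shared requests.Session with browser-like headers, presumably so the site's cookies and bot filtering stop tripping up the scrape. A minimal sketch of the idea, not the scraper's own code; the cookie name in the comment is an assumption:

import requests

# One Session shared across calls: cookies set by the first GET persist,
# which matters for ASP.NET backends that tie state to a session cookie.
session = requests.Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
)
resp = session.get("https://legis.delaware.gov/", verify=False)
# The jar now holds whatever the site set (e.g. an ASP.NET_SessionId,
# by assumption); later requests on this Session send it automatically.
print(resp.status_code, list(session.cookies.get_dict()))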
scrapers/de/__init__.py: 10 additions & 4 deletions
@@ -1,4 +1,5 @@
-from utils import url_xpath
+import requests
+import lxml
 from openstates.scrape import State
 from .bills import DEBillScraper
 from .events import DEEventScraper
@@ -137,9 +138,14 @@ class Delaware(State):
     ]

     def get_session_list(self):
+        ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
+        headers = {"User-Agent": ua}
+
         url = "https://legis.delaware.gov/"
-        sessions = url_xpath(
-            url, '//select[@id="billSearchGARefiner"]/option/text()', verify=False
-        )
+
+        page = requests.get(url, headers=headers, verify=False).content
+        page = lxml.html.fromstring(page)
+
+        sessions = page.xpath('//select[@id="billSearchGARefiner"]/option/text()')
         sessions = [session.strip() for session in sessions if session.strip()]
         return sessions
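For reference, the reworked get_session_list as a standalone script — a sketch assuming only what the hunk above shows (URL, XPath, and User-Agent); what it prints depends on the options currently in the site's General Assembly dropdown:

import lxml.html
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
}
# verify=False mirrors the diff: TLS verification is skipped for this host.
page = requests.get(
    "https://legis.delaware.gov/", headers=headers, verify=False
).content
doc = lxml.html.fromstring(page)
# Each non-empty <option> in the GA refiner select is one session name.
options = doc.xpath('//select[@id="billSearchGARefiner"]/option/text()')
print([o.strip() for o in options if o.strip()])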
scrapers/de/bills.py: 47 additions & 8 deletions
@@ -1,6 +1,6 @@
 import datetime as dt
 import json
-
+import requests
 from openstates.scrape import Scraper, Bill, VoteEvent
 from utils import LXMLMixin
 from .actions import Categorizer
@@ -14,6 +14,9 @@ class DEBillScraper(Scraper, LXMLMixin):
     legislators = {}
     legislators_by_short = {}
     legislators_by_district = {}
+
+    session = requests.Session()
+
     """
     DE has caucus-specific sites that it now
    uses to identify bill sponsors...sometimes.
@@ -28,9 +31,28 @@ class DEBillScraper(Scraper, LXMLMixin):
     }

     def scrape(self, session=None):
+
+        self.headers = {
+            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "accept-language": "en-US,en;q=0.9",
+            "dnt": "1",
+            "priority": "u=0, i",
+            "sec-ch-ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
+            "sec-ch-ua-mobile": "?0",
+            "sec-ch-ua-platform": '"Linux"',
+            "sec-fetch-dest": "document",
+            "sec-fetch-mode": "navigate",
+            "sec-fetch-site": "none",
+            "sec-fetch-user": "?1",
+            "upgrade-insecure-requests": "1",
+            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+        }
+
         self.retry_attempts = 10
         self.retry_wait_seconds = 30
         self.timeout = 130
+        self.init_asp()
+        self.headers["x-requested-with"] = "XMLHttpRequest"
         # Cache the legislators, we'll need them for sponsors and votes
         self.scrape_legislators(session)
@@ -250,12 +272,15 @@ def scrape_legislators(self, session):
         }

         self.info("Fetching legislators")
-        page = self.post(
+        page = self.session.post(
             url=search_form_url,
             data=form,
             allow_redirects=True,
             verify=False,
-        ).json()
+            headers=self.headers,
+        ).content
+
+        page = json.loads(page)
         assert page["Data"], "Could not fetch legislators!"
         for row in page["Data"]:
             self.legislators[str(row["PersonId"])] = row
@@ -272,7 +297,9 @@ def scrape_votes(self, bill, legislation_id, session):
         )
         form = {"legislationId": legislation_id, "sort": "", "group": "", "filter": ""}
         self.info(f"Searching for votes for {bill.identifier}")
-        response = self.post(url=votes_url, data=form, allow_redirects=True)
+        response = self.session.post(
+            url=votes_url, data=form, allow_redirects=True, headers=self.headers
+        )
         if response.content:
             page = json.loads(response.content.decode("utf-8"))
             if page["Total"] > 0:
@@ -286,7 +313,9 @@ def scrape_vote(self, bill, vote_id, session):
         form = {"rollCallId": vote_id, "sort": "", "group": "", "filter": ""}

         self.info(f"Fetching vote {vote_id} for {bill.identifier}")
-        page = self.post(url=vote_url, data=form, allow_redirects=True).json()
+        page = self.session.post(
+            url=vote_url, data=form, allow_redirects=True, headers=self.headers
+        ).json()
         if page:
             roll = page["Model"]
             vote_chamber = self.chamber_map[roll["ChamberName"]]
@@ -381,7 +410,9 @@ def scrape_actions(self, bill, legislation_id):
         )
         form = {"legislationId": legislation_id, "sort": "", "group": "", "filter": ""}
         self.info(f"Fetching actions for {bill.identifier}")
-        page = self.post(url=actions_url, data=form, allow_redirects=True).json()
+        page = self.session.post(
+            url=actions_url, data=form, allow_redirects=True, headers=self.headers
+        ).json()
         for row in page["Data"]:
             action_name = row["ActionDescription"]
             action_date = dt.datetime.strptime(
@@ -423,7 +454,9 @@ def scrape_amendments(self, bill, legislation_id):
         )
         form = {"sort": "", "group": "", "filter": ""}
         self.info(f"Fetching amendments for {bill.identifier}")
-        page = self.post(url=amds_url, data=form, allow_redirects=True)
+        page = self.session.post(
+            url=amds_url, data=form, allow_redirects=True, headers=self.headers
+        )
         if page.content == b"":
             return
         else:
@@ -500,11 +533,12 @@ def post_search(self, session, page_number, per_page):
             "toIntroDate": "",
         }

-        page = self.post(
+        page = self.session.post(
             url=search_form_url,
             data=form,
             allow_redirects=True,
             verify=False,
+            headers=self.headers,
         ).json()

         return page
@@ -518,3 +552,8 @@ def mime_from_link(self, link):
             return "application/msword"
         else:
             return ""
+
+    # set up our asp session and fetch cookies
+    def init_asp(self):
+        self.session.get("https://legis.delaware.gov/", headers=self.headers).content
+        return
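Taken together, scrape() now follows a warm-up pattern: build browser-style headers once, let init_asp() issue a single GET to collect the site's cookies, mark later requests as AJAX with x-requested-with, and push every form POST through the shared session. A sketch of one such round trip; the endpoint URL is a hypothetical placeholder, since the real search_form_url/votes_url values live outside the hunks shown above:

import json

import requests

session = requests.Session()
headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
}

# Warm-up GET, as in init_asp(): the body is discarded, only cookies matter.
session.get("https://legis.delaware.gov/", headers=headers)

# Subsequent JSON POSTs identify themselves as AJAX, matching scrape().
headers["x-requested-with"] = "XMLHttpRequest"

ENDPOINT = "https://legis.delaware.gov/json/..."  # hypothetical placeholder
response = session.post(
    ENDPOINT,
    data={"sort": "", "group": "", "filter": ""},
    headers=headers,
    allow_redirects=True,
)
# Mirrors the scrape_legislators change: json.loads() on the raw bytes,
# guarded against the empty responses scrape_amendments checks for.
page = json.loads(response.content) if response.content else None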
