Skip to content

Commit

Permalink
VA: Bills: add 2024 prefiles, remove sftp
Browse files Browse the repository at this point in the history
  • Loading branch information
showerst committed Nov 29, 2023
1 parent e6ce032 commit 59cecd8
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 51 deletions.
9 changes: 8 additions & 1 deletion scrapers/va/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,18 @@ class Virginia(State):
"name": "2023, 1st Special Session",
"start_date": "2023-08-05",
"end_date": "2023-08-13",
"active": False,
},
{
"_scraped_name": "2024 Session",
"identifier": "2024",
"name": "2024 Regular Session",
"start_date": "2024-01-10",
"end_date": "2024-03-09",
"active": True,
},
]
ignored_scraped_sessions = [
"2024",
"2021 Special Session I",
"2015 Special Session I",
"2015 Session",
Expand Down
1 change: 1 addition & 0 deletions scrapers/va/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"2022S1": "222",
"2023": "231",
"2023S1": "232",
"2024": "241",
}

# Parent site-id -> child site-ids whose files are grouped under the parent
# session's directory on the source site (the sftp layout used paths like
# /CSV221/csv232).  NOTE(review): "221" (2022 regular) groups the 2022S1,
# 2023, and 2023S1 special-session ids — confirm against the consumer.
COMBINED_SESSIONS = {"221": ["222", "231", "232"]}
57 changes: 7 additions & 50 deletions scrapers/va/csv_bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,10 @@
import re
import pytz
import datetime
from paramiko.client import SSHClient, AutoAddPolicy
import paramiko
from openstates.scrape import Scraper, Bill, VoteEvent
from collections import defaultdict
import time

from .common import SESSION_SITE_IDS, COMBINED_SESSIONS
from .common import SESSION_SITE_IDS
from .actions import Categorizer

tz = pytz.timezone("America/New_York")
Expand All @@ -25,53 +22,14 @@ class VaCSVBillScraper(Scraper):
_bills = defaultdict(list)
_summaries = defaultdict(list)

_session_id: int
categorizer = Categorizer()

def _init_sftp(self, session_id):
    """Open an sftp connection to the VA DLAS server and chdir to the
    CSV directory for *session_id*, storing the client on ``self.sftp``.

    Retries on auth failure with a linear backoff (30s, 60s, 90s) and
    gives up after more than 3 failed attempts, re-raising
    AuthenticationException.
    """
    client = SSHClient()
    # Auto-accept unknown host keys (no known_hosts verification).
    client.set_missing_host_key_policy(AutoAddPolicy)
    connected = False
    attempts = 0
    while not connected:
        try:
            # SECURITY(review): credentials are hardcoded in source —
            # should be moved to environment variables / secrets storage.
            client.connect(
                "sftp.dlas.virginia.gov",
                username="rjohnson",
                password="E8Tmg%9Dn!e6dp",
                compress=True,
            )
        except paramiko.ssh_exception.AuthenticationException:
            attempts += 1
            self.logger.warning(
                f"Auth failure...sleeping {attempts * 30} seconds and retrying"
            )
            # hacky backoff!
            time.sleep(attempts * 30)
        else:
            connected = True
        # importantly, we shouldn't try forever
        if attempts > 3:
            break
    if not connected:
        # Exhausted retries without a successful login.
        raise paramiko.ssh_exception.AuthenticationException
    self.sftp = client.open_sftp()
    """
    Set working directory for sftp client based on session
    """
    # Special sessions grouped under a parent session share the parent's
    # /CSV<parent> directory; see COMBINED_SESSIONS in common.py.
    for k, sessions in COMBINED_SESSIONS.items():
        if session_id in sessions:
            self.sftp.chdir(f"/CSV{k}/csv{session_id}")
            break
    else:
        """
        for -> else blocks only work when you've gone through
        every step in a for loop without breaking
        so this is kinda like setting a default
        """
        self.sftp.chdir(f"/CSV{session_id}/csv{session_id}")

def get_file(self, filename):
    """Fetch one CSV export for the current session over HTTPS.

    :param filename: name of the CSV file to fetch; normalized to the
        legacy sftp-era casing (first letter upper, rest lower) before
        being interpolated into the URL.
    :returns: the response body as text.
    """
    # keeping old filenames in case we ever need to go back to sftp
    filename = filename.lower().capitalize()
    # Interpolate the normalized filename into the per-session CSV URL.
    url = f"https://lis.virginia.gov/SiteInformation/csv/{self._session_id}/{filename}"
    return self.get(url).text

# Load members of the legislature
def load_members(self):
Expand Down Expand Up @@ -243,7 +201,7 @@ def scrape(self, session=None):
is_special = True

session_id = SESSION_SITE_IDS[session]
self._init_sftp(session_id)
self._session_id = session_id
bill_url_base = "https://lis.virginia.gov/cgi-bin/"

if not is_special:
Expand Down Expand Up @@ -429,4 +387,3 @@ def scrape(self, session=None):
)

yield b
self.sftp.close()

0 comments on commit 59cecd8

Please sign in to comment.