diff --git a/scrapers/va/__init__.py b/scrapers/va/__init__.py index a4c14b99fe..919b76aace 100644 --- a/scrapers/va/__init__.py +++ b/scrapers/va/__init__.py @@ -3,6 +3,7 @@ from openstates.scrape import State from .csv_bills import VaCSVBillScraper from .events import VaEventScraper +from .bills import VaBillScraper logging.getLogger(__name__).addHandler(logging.NullHandler()) @@ -13,7 +14,8 @@ class Virginia(State): scrapers = { "events": VaEventScraper, - "bills": VaCSVBillScraper, + "csv_bills": VaCSVBillScraper, + "bills": VaBillScraper, } legislative_sessions = [ { @@ -223,7 +225,16 @@ class Virginia(State): "start_date": "2024-05-13", # TODO: update actual end date "end_date": "2024-05-20", + "active": False, + }, + { + "_scraped_name": "2025 Session", + "identifier": "2025", + "name": "2025 Regular Session", + "start_date": "2025-01-08", + "end_date": "2025-02-22", "active": True, + "extras": {"session_code": "20251"}, }, ] ignored_scraped_sessions = [ diff --git a/scrapers/va/bills.py b/scrapers/va/bills.py new file mode 100644 index 0000000000..de63fc0728 --- /dev/null +++ b/scrapers/va/bills.py @@ -0,0 +1,322 @@ +import re + +import dateutil +import json +import lxml +import os +import pytz +import requests +import urllib3 + +from openstates.scrape import Scraper, Bill, VoteEvent +from .actions import Categorizer + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +four_digit_regex = re.compile("[0-9]{4}") + + +class VaBillScraper(Scraper): + tz = pytz.timezone("America/New_York") + headers: object = {} + base_url: str = "https://lis.virginia.gov" + session_code: str = "" + categorizer = Categorizer() + + chamber_map = { + "S": "upper", + "H": "lower", + } + + vote_map = { + "Y": "yes", + "N": "no", + "X": "not voting", + "A": "abstain", + "V": "other", + } + + ref_num_map: object = {} + + def scrape(self, session=None): + + for i in self.jurisdiction.legislative_sessions: + if i["identifier"] == session: + self.session_code = i["extras"]["session_code"] + + if not os.getenv("VA_API_KEY"): + self.error( + "Virginia requires an LIS api key. Register at https://lis.virginia.gov/developers \n API key registration can take days, the csv_bills scraper works without one." 
+ ) + return + + self.headers = { + "WebAPIKey": os.getenv("VA_API_KEY"), + "Content-Type": "application/json", + "Accept": "application/json", + } + + body = {"SessionCode": self.session_code} + + page = requests.post( + f"{self.base_url}/Legislation/api/getlegislationlistasync", + headers=self.headers, + json=body, + verify=False, + ).json() + + for row in page["Legislations"]: + # print(json.dumps(row)) + + # the short title on the VA site is 'description', + # LegislationTitle is on top of all the versions + title = row["Description"] + subtitle = self.text_from_html(row["LegislationTitle"]) + description = self.text_from_html(row["LegislationSummary"]) + + bill = Bill( + row["LegislationNumber"], + session, + title, + chamber=self.chamber_map[row["ChamberCode"]], + classification=self.classify_bill(row), + ) + + self.add_actions(bill, row["LegislationID"]) + self.add_versions(bill, row["LegislationID"]) + self.add_carryover_related_bill(bill) + self.add_sponsors(bill, row["Patrons"]) + yield from self.add_votes(bill, row["LegislationID"]) + bill.add_abstract(subtitle, note="title") + bill.add_abstract(description, row["SummaryVersion"]) + + bill.extras["VA_LEG_ID"] = row["LegislationID"] + + bill.add_source( + f"https://lis.virginia.gov/bill-details/{self.session_code}/{row['LegislationNumber']}" + ) + + yield bill + + def add_actions(self, bill: Bill, legislation_id: str): + body = { + "sessionCode": self.session_code, + "legislationID": legislation_id, + } + + page = requests.get( + f"{self.base_url}/LegislationEvent/api/getlegislationeventbylegislationidasync", + params=body, + headers=self.headers, + verify=False, + ).json() + + for row in page["LegislationEvents"]: + when = dateutil.parser.parse(row["EventDate"]).date() + action_attr = self.categorizer.categorize(row["Description"]) + classification = action_attr["classification"] + + bill.add_action( + chamber=self.chamber_map[row["ChamberCode"]], + description=row["Description"], + date=when, + classification=classification, + ) + + # map reference numbers back to their actions for impact filenames + # HB9F122.PDF > { 'HB9F122' => "Impact statement from DPB (HB9)" } + if row["ReferenceNumber"]: + ref_num = row["ReferenceNumber"].split(".")[0] + self.ref_num_map[ref_num] = row["Description"] + + def add_carryover_related_bill(self, bill: Bill): + # Many bills are carried from one session to the next + # but this does not apply to special sessions + if four_digit_regex.fullmatch(bill.legislative_session) is not None: + prior_session = int(bill.legislative_session) - 1 + else: + return + + has_carryover_action = False + for action in bill.actions: + if ( + f"continued to {bill.legislative_session}" + in action["description"].lower() + ): + has_carryover_action = True + + if has_carryover_action: + bill.add_related_bill(bill.identifier, f"{prior_session}", "prior-session") + + def add_sponsors(self, bill: Bill, sponsors: list): + for row in sponsors: + primary = True if row["Name"] == "Chief Patron" else False + bill.add_sponsorship( + row["MemberDisplayName"], + chamber=self.chamber_map[row["ChamberCode"]], + entity_type="person", + classification="primary" if primary else "cosponsor", + primary=primary, + ) + + def add_versions(self, bill: Bill, legislation_id: str): + body = { + "sessionCode": self.session_code, + "legislationID": legislation_id, + } + page = requests.get( + f"{self.base_url}/LegislationText/api/getlegislationtextbyidasync", + params=body, + headers=self.headers, + verify=False, + ).json() + + for row in 
page["TextsList"]:
+            # print(json.dumps(row))
+            if (row["PDFFile"] and len(row["PDFFile"]) > 1) or (
+                row["HTMLFile"] and len(row["HTMLFile"]) > 1
+            ):
+                self.error(json.dumps(row))
+                self.error("Add code to handle multiple files to VA Scraper")
+                raise Exception
+
+            if row["PDFFile"] and len(row["PDFFile"]) > 0:
+                bill.add_version_link(
+                    row["Description"],
+                    row["PDFFile"][0]["FileURL"],
+                    media_type="application/pdf",
+                )
+
+            if row["HTMLFile"] and len(row["HTMLFile"]) > 0:
+                bill.add_version_link(
+                    row["Description"],
+                    row["HTMLFile"][0]["FileURL"],
+                    media_type="text/html",
+                )
+
+            if row["ImpactFile"]:
+                for impact in row["ImpactFile"]:
+                    # map 241HB9F122 => HB9F122
+                    action = self.ref_num_map[impact["ReferenceNumber"][3:]]
+                    bill.add_document_link(
+                        action, impact["FileURL"], media_type="application/pdf"
+                    )
+
+    # This method didn't work as of 2024-10-15, but leaving the code in
+    # in case they bring it back
+    # def get_vote_types(self):
+
+    #     page = requests.get(
+    #         f"{self.base_url}/api/getvotetypereferencesasync",
+    #         headers=self.headers,
+    #         verify=False,
+    #     ).content
+
+    #     print(page)
+
+    def add_votes(self, bill: Bill, legislation_id: str):
+        body = {
+            "sessionCode": self.session_code,
+            "legislationID": legislation_id,
+        }
+
+        page = requests.get(
+            f"{self.base_url}/Vote/api/getvotebyidasync",
+            params=body,
+            headers=self.headers,
+            verify=False,
+        ).content
+
+        if not page:
+            return
+
+        page = json.loads(page)
+
+        for row in page["Votes"]:
+            # VA voice votes don't indicate pass/fail, and right now OS core
+            # requires a pass or fail result, so we skip them with a notice
+            if row["PassFail"] or row["IsVoice"] is not True:
+                vote_date = dateutil.parser.parse(row["VoteDate"]).date()
+
+                motion_text = row["VoteActionDescription"]
+
+                # the API returns 'Continued to %NextSessionYear% in Finance',
+                # so substitute the actual year
+                motion_text = motion_text.replace(
+                    "%NextSessionYear%", str(vote_date.year + 1)
+                )
+
+                v = VoteEvent(
+                    start_date=vote_date,
+                    motion_text=motion_text,
+                    bill_action=row["LegislationActionDescription"],
+                    result="fail",  # placeholder; the real result is set from the tally below
+                    chamber=self.chamber_map[row["ChamberCode"]],
+                    bill=bill,
+                    classification=[],
+                )
+
+                v.dedupe_key = row["BatchNumber"]
+
+                tally = {
+                    "Y": 0,
+                    "N": 0,
+                    "X": 0,  # not voting
+                    "A": 0,  # abstain
+                    "V": 0,  # other
+                }
+
+                for subrow in row["VoteMember"]:
+                    v.vote(
+                        self.vote_map[subrow["ResponseCode"]],
+                        subrow["MemberDisplayName"],
+                    )
+
+                    tally[subrow["ResponseCode"]] += 1
+
+                v.set_count("yes", tally["Y"])
+                v.set_count("no", tally["N"])
+                v.set_count("abstain", tally["A"])
+                v.set_count("not voting", tally["X"])
+                v.set_count("other", tally["V"])
+
+                if tally["Y"] == 0 and tally["N"] == 0 and tally["A"] == 0:
+                    # some voice votes are miscoded and don't contain data
+                    continue
+
+                if tally["Y"] > tally["N"]:
+                    v.result = "pass"
+                else:
+                    v.result = "fail"
+
+                # https://lis.virginia.gov/vote-details/HB88/20251/H1003V0001
+                v.add_source(
+                    f"https://lis.virginia.gov/vote-details/{row['VoteLegislation']}/{self.session_code}/{row['BatchNumber']}"
+                )
+                yield v
+            else:
+                self.info(f"Skipping vote {row['BatchNumber']} with no pass/fail result")
+
+    def classify_bill(self, row: dict):
+        btype = "bill"
+
+        if "constitutional amendment" in row["Description"].lower():
+            btype = "constitutional amendment"
+
+        return btype
+
+    # TODO: we can get the subject list,
+    # then do a search API call for each individual subject,
+    # but is there a faster way?
+ # def get_subjects(self): + # body = { + # "sessionCode": self.session_code, + # } + # page = requests.get( + # f"{self.base_url}/LegislationSubject/api/getsubjectreferencesasync", + # params=body, + # headers=self.headers, + # verify=False, + # ).json() + + def text_from_html(self, html: str): + return lxml.html.fromstring(html).text_content() diff --git a/scrapers/va/common.py b/scrapers/va/common.py index ca6ab52ad1..68952c3faf 100644 --- a/scrapers/va/common.py +++ b/scrapers/va/common.py @@ -29,6 +29,7 @@ "2023S1": "232", "2024": "241", "2024S1": "242", + "2025": "243", } COMBINED_SESSIONS = {"221": ["222", "231", "232", "241", "242"]} diff --git a/scrapers/va/csv_bills.py b/scrapers/va/csv_bills.py index 8ab39a6cc6..9ae38a022c 100644 --- a/scrapers/va/csv_bills.py +++ b/scrapers/va/csv_bills.py @@ -1,15 +1,13 @@ import csv import re import pytz -import datetime -from paramiko.client import SSHClient, AutoAddPolicy -import paramiko from openstates.scrape import Scraper, Bill, VoteEvent from collections import defaultdict -import time +import dateutil -from .common import SESSION_SITE_IDS, COMBINED_SESSIONS +from .common import SESSION_SITE_IDS from .actions import Categorizer +from scrapelib import HTTPError tz = pytz.timezone("America/New_York") @@ -28,56 +26,16 @@ class VaCSVBillScraper(Scraper): _session_id: int categorizer = Categorizer() - def _init_sftp(self, session_id): - client = SSHClient() - client.set_missing_host_key_policy(AutoAddPolicy) - connected = False - attempts = 0 - while not connected: - try: - client.connect( - "sftp.dlas.virginia.gov", - username="rjohnson", - password="E8Tmg%9Dn!e6dp", - compress=True, - ) - except paramiko.ssh_exception.AuthenticationException: - attempts += 1 - self.logger.warning( - f"Auth failure...sleeping {attempts * 30} seconds and retrying" - ) - # hacky backoff! 
- time.sleep(attempts * 30) - else: - connected = True - # importantly, we shouldn't try forever - if attempts > 3: - break - if not connected: - raise paramiko.ssh_exception.AuthenticationException - self.sftp = client.open_sftp() - """ - Set working directory for sftp client based on session - """ - for k, sessions in COMBINED_SESSIONS.items(): - if session_id in sessions: - self.sftp.chdir(f"/CSV{k}/csv{session_id}") - break - else: - """ - for -> else blocks only work when you've gone through - every step in a for loop without breaking - so this is kinda like setting a default - """ - self.sftp.chdir(f"/CSV{session_id}/csv{session_id}") - def get_file(self, filename): - # old sftp thing rn - return self.sftp.open(filename).read().decode(errors="ignore") - # keeping old filenames in case we ever need to go back to sftp - # filename = filename.lower().capitalize() - # url = f"https://lis.virginia.gov/SiteInformation/csv/{self._session_id}/{filename}" - # return self.get(url).text + # see https://lis.virginia.gov/data-files + # note: the url pattern given in the notes on that page is wrong, + # use the links at the bottom + try: + url = f"https://lis.blob.core.windows.net/lisfiles/{self._session_id}/{filename}" + return self.get(url).text + except HTTPError: + self.info(f"HTTP error on {url}, skipping") + return "" # Load members of legislative def load_members(self): @@ -249,7 +207,8 @@ def scrape(self, session=None): is_special = True session_id = SESSION_SITE_IDS[session] - self._init_sftp(session_id) + self._session_id = "20251" + # self._init_sftp(session_id) bill_url_base = "https://lis.virginia.gov/cgi-bin/" if not is_special: @@ -279,7 +238,9 @@ def scrape(self, session=None): chamber=chamber, classification=bill_type, ) - bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}" + bill_url = ( + f"https://lis.virginia.gov/bill-details/{self._session_id}/{bill_id}" + ) b.add_source(bill_url) # Long Bill ID needs to have 6 characters to work with vote urls, sponsors, and summaries. 
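The rewritten get_file above pulls the per-session CSV exports from the LIS data-files endpoint instead of SFTP. A minimal standalone sketch of that download pattern, assuming the 2025 session id "20251" and a hypothetical Members.csv export name (not part of the patch):

import csv
import io

import requests

# Sketch only; the session id and filename below are assumptions for illustration.
session_id = "20251"
filename = "Members.csv"  # hypothetical export name
url = f"https://lis.blob.core.windows.net/lisfiles/{session_id}/{filename}"

resp = requests.get(url)
if resp.ok:
    for row in csv.DictReader(io.StringIO(resp.text)):
        print(row)
else:
    # the scraper logs and returns "" when a file is missing
    print(f"HTTP error on {url}, skipping")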
@@ -325,11 +286,12 @@ def scrape(self, session=None): # Amendment docs amendments = self._amendments[bill_id] for amend in amendments: - doc_link = ( - bill_url_base + f"legp604.exe?{session_id}+amd+{amend['txt_docid']}" - ) + version_url = f"https://lis.virginia.gov/bill-details/{self._session_id}/{bill_id}/text/{amend['txt_docid'].strip()}" + b.add_document_link( - "Amendment: " + amend["txt_docid"], doc_link, media_type="text/html" + "Amendment: " + amend["txt_docid"], + version_url, + media_type="text/html", ) # fiscal notes @@ -347,7 +309,7 @@ def scrape(self, session=None): for hist in self._history[bill_id]: action = hist["history_description"] action_date = hist["history_date"] - date = datetime.datetime.strptime(action_date, "%m/%d/%y").date() + date = dateutil.parser.parse(action_date).date() chamber = chamber_types[action[0]] vote_id = hist["history_refid"] cleaned_action = action[2:] @@ -405,15 +367,8 @@ def scrape(self, session=None): for version in bill["text_docs"]: # Checks if abbr is blank as not every bill has multiple versions if version["doc_abbr"]: - version_url = ( - bill_url_base - + f"legp604.exe?{session_id}+ful+{version['doc_abbr']}" - ) - pdf_url = version_url + "+pdf" - - version_date = datetime.datetime.strptime( - version["doc_date"], "%m/%d/%y" - ).date() + version_url = f"https://lis.virginia.gov/bill-details/{self._session_id}/{bill_id}/text/{version['doc_abbr'].strip()}" + version_date = dateutil.parser.parse(version["doc_date"]).date() # version text will default to abbreviation provided in CSV # but if there is an unambiguous action from that date with # a version, we'll use that as the document title @@ -427,13 +382,5 @@ def scrape(self, session=None): media_type="text/html", on_duplicate="ignore", ) - b.add_version_link( - version_text, - pdf_url, - date=version_date, - media_type="application/pdf", - on_duplicate="ignore", - ) yield b - self.sftp.close() diff --git a/scrapers/va/events.py b/scrapers/va/events.py index 4bb0872d65..5b5a704ab9 100644 --- a/scrapers/va/events.py +++ b/scrapers/va/events.py @@ -1,256 +1,91 @@ from openstates.scrape import Scraper, Event -import lxml -import dateutil.parser -import re +import datetime +import dateutil +import json import pytz -from urllib import parse -from dateutil.tz import gettz - -from .common import SESSION_SITE_IDS +import re -# NOTE: because of how the bill scraper is imported, this must be run with -# VIRGINIA_FTP_USER="" VIRGINIA_FTP_PASSWORD="" PYTHONPATH=scrapers poetry run os-update va events --scrape -# You don't need a valid u/p for events, the env vars just need to be set. class VaEventScraper(Scraper): _tz = pytz.timezone("America/New_York") - tzinfos = {"EDT": gettz("America/New_York"), "EST": gettz("America/New_York")} - - def scrape(self, session): - session_id = SESSION_SITE_IDS[session] - - yield from self.scrape_lower() - yield from self.scrape_upper(session_id) - - def scrape_lower(self): - list_url = "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php?range=long" - page = self.get(list_url).content - page = lxml.html.fromstring(page) + def scrape(self, start_date=None): + # TODO: what's the deal with this WebAPIKey, will it expire? + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", + "WebAPIKey": "FCE351B6-9BD8-46E0-B18F-5572F4CCA5B9", + } + + # e.g. 
10/10/2024
+        if start_date:
+            start_date = dateutil.parser.parse(start_date).strftime("%m/%d/%Y")
+        else:
+            start_date = datetime.datetime.today().strftime("%m/%d/%Y")
+
+        url = f"https://lis.virginia.gov/Schedule/api/GetScheduleListAsync?startDate={start_date}%2000:00:00"
+        page = self.get(url, verify=False, headers=headers)
+        page = json.loads(page.content)
+        for row in page["Schedules"]:
+            status = "tentative"
+            name = row["OwnerName"].strip()
-        page.make_links_absolute(list_url)
+            if name == "":
+                name = row["Description"].split(";")[0].strip()
-        for row in page.xpath("//table[contains(@class, 'CODayTable')]/tbody/tr"):
-            # TODO: it would be nice to go back in and update the record to mark it as cancelled,
-            # but since there's no ics link it makes the day logic way more complicated
-            if row.xpath(".//span[contains(@class, 'COCancelled')]"):
-                continue
+            # they seem to set all the dates to noon and
+            # add the actual time to a separate field.
+            when_date = row["ScheduleDate"].replace("T12:00:00", "")
+            when_time = row["ScheduleTime"]
-            # fallback for unlinked events
-            source = "https://virginiageneralassembly.gov/house/schedule/meetingSchedule.php?range=long"
+            when = dateutil.parser.parse(f"{when_date} {when_time}")
+            when = self._tz.localize(when)
-            if row.xpath(".//a[1]/text()"):
-                title = row.xpath(".//a[1]/text()")[0].strip()
-                source = row.xpath(".//a[1]/@href")[0]
-                event_type = "committee-meeting"
+            if "RoomDescription" in row:
+                location = row["RoomDescription"]
             else:
-                # skip unlinked misc events
-                if row.xpath("td[contains(@class, 'COCommType')]/text()"):
-                    title = row.xpath("td[contains(@class, 'COCommType')]/text()")[
-                        0
-                    ].strip()
-                    event_type = "other"
-                else:
-                    continue
-
-            # cancelled so we lose date/time info
-            if not row.xpath(".//a[@title='Add to Calendar']/@href"):
-                continue
-
-            date_link = row.xpath(".//a[@title='Add to Calendar']/@href")[0]
-            parsed = parse.parse_qs(parse.urlparse(date_link).query)
-            date_raw = parsed["dt"][0]
-            loc_raw = parsed["loc"][0]
-            # Prevent invalid length of location name
-            location = loc_raw[:198] if len(loc_raw) > 199 else loc_raw
-
-            start = dateutil.parser.parse(date_raw, tzinfos=self.tzinfos)
-
-            # If there's a chair in parentheticals, remove them from the title
-            # and add as a person instead
-            chair_note = re.findall(r"\(.*\)", title)
-            chair = None
-            for chair_str in chair_note:
-                title = title.replace(chair_str, "").strip()
-                # drop the outer parens
-                chair = chair_str[1:-1]
-
-            event = Event(
-                name=title,
-                start_date=start,
-                location_name=location,
-                classification=event_type,
-            )
-            event.add_source(source)
-            event.dedupe_key = f"{title}#{location}#{start}"
-
-            if chair is not None:
-                event.add_participant(chair, type="person", note="chair")
-
-            if event_type == "committee-meeting":
-                event.add_participant(title, type="committee", note="host")
-
-            if row.xpath(".//a[contains(@class,'COAgendaLink')]"):
-                agenda_url = row.xpath(".//a[contains(@class,'COAgendaLink')]/@href")[0]
-                event.add_document("Agenda", agenda_url, media_type="text/html")
-                self.scrape_lower_agenda(event, agenda_url)
-
-            yield event
-
-    def scrape_lower_agenda(self, event, url):
-        page = self.get(url).content
-        page = lxml.html.fromstring(page)
+                location = row["Description"]
-        page.make_links_absolute(url)
+            if location == "":
+                location = "See Agenda"
-        if page.xpath(
-            '//tr[td[contains(@class,"agendaLabel") and contains(text(), "Notes")]]/td[2]'
-        ):
-            note = page.xpath(
-                '//tr[td[contains(@class,"agendaLabel") and contains(text(),
"Notes")]]/td[2]/text()' - )[0].strip() - event.add_agenda_item(note) - - for row in page.xpath('//div[contains(@class,"agendaContainer")]'): - title = row.xpath( - './/span[contains(@class,"reportBlockContainerCon")]/h2/text()' - )[0].strip() - agenda = event.add_agenda_item(title) - summary = row.xpath(".//table/@summary") - if not summary: - continue - summary = summary[0] - for bill in row.xpath('.//tr[contains(@class, "standardZebra")]/td[1]/a'): - name = bill.xpath("string()").strip() - if "Attachment" in summary: - url = bill.xpath("@href")[0] - agenda.add_media_link(name, url, media_type="application/pdf") - elif "Block of this committee" in summary: - bill_regex = re.compile(r"(HB|HJ|HR|SB|SJ|SR)[0-9]+") - if bill_regex.match(name): - agenda.add_bill(name) - else: - raise Exception("Invalid format of Bill ID") - else: - raise Exception("Unknown types of agenda") - - def scrape_upper(self, session_id): - list_url = f"https://lis.virginia.gov/cgi-bin/legp604.exe?{session_id}+oth+MTG&{session_id}+oth+MTG" - page = self.get(list_url).content - page = lxml.html.fromstring(page) - page.make_links_absolute(list_url) - - date = "" - time = "" - # note the [td] at the end, they have some empty tr-s so skip them - for row in page.xpath("//div[@id='mainC']/center/table//tr[td]"): - if row.xpath("td[1]/text()")[0].strip() != "": - date = row.xpath("td[1]/text()")[0].strip() - - time_col = row.xpath("td[2]/text()")[0] - status = "tentative" - if "cancelled" in time_col.lower(): + if row["IsCancelled"]: status = "cancelled" - if "a.m." in time_col or "p.m." in time_col: - time = time_col.replace("a.m.", "am").replace("p.m.", "pm").strip() - - when = dateutil.parser.parse(f"{date} {time}".strip()) - when = self._tz.localize(when) - - description = row.xpath("td[3]")[0].xpath("string()") - description = " ".join(description.split()).strip() - - # location is generally everything after the semicolon in the description - # it is sometimes the thing after colon in description - # removes these strings "- 1/2 hour, - 2 hours, - 30 minutes, - Immediately, (...)" in the description - desc_split = re.split( - r"(?:\:|;|\(|\)|-[\s\d\/\.]+(?:hour(?:s)?|minute(?:s)?|Immediately))", - description, - ) - if len(desc_split) > 1: - loc_raw = desc_split[1].strip() - # Prevent invalid length of location name - if len(loc_raw) > 1: - location = loc_raw[:198] if len(loc_raw) > 199 else loc_raw - else: - location = "Unknown" - else: - location = "Unknown" event = Event( - name=description, + name=name, start_date=when, classification="committee-meeting", location_name=location, status=status, + description=row["Description"], ) - event.add_source(list_url) - - # committee info & sub-committee info urls - committee_info_xpath = row.xpath( - './/a[contains(., "committee info")]/@href' - ) - # for senate only. 
- if "Senate" in description and committee_info_xpath: - committee_url = committee_info_xpath[0] - if "lis.virginia.gov" in committee_url: - self.scrape_upper_com(event, committee_url) + event.add_source("https://lis.virginia.gov/schedule") + + for ct, attach in enumerate(row["ScheduleFiles"]): + if ct == 0: + event.add_document( + "Agenda", + attach["FileURL"], + media_type="application/pdf", + ) + else: + event.add_document( + f"Attachment {ct}", + attach["FileURL"], + media_type="application/pdf", + ) + + if "press conference" not in name.lower(): + if "joint meeting of" in name.lower(): + coms = name.replace("Joint Meeting of", "") + # "joint meeting of com 1, com2 and com3" + # becomes ['com 1', 'com2', 'com3'] + for com in re.split(r",|and", coms, flags=re.I): + # the rstrip here catches some trailing dashes + com = com.strip().rstrip("- ") + if com: + event.add_committee(com) + else: + event.add_committee(name.strip()) yield event - - def scrape_upper_com(self, event, url): - page = self.get(url).content - page = lxml.html.fromstring(page) - page.make_links_absolute(url) - - # add members - for person in ( - page.xpath('//div[@id="mainC"]/p[./a[contains(@href, "mbr")]]')[0] - .xpath("string()") - .split(",") - ): - event.add_participant( - person.split("(")[0].strip(), - type="person", - note=person.split("(")[1].strip(") ").lower() - if "(" in person - else "member", - ) - - # add committee name - committee_name = ( - page.xpath('//div[@id="mainC"]/h3[@class="xpad"]')[0] - .xpath("string()") - .replace("\n", "") - .strip() - ) - event.add_participant(committee_name, type="committee", note="host") - # get the url for only event date - event_dt = event.start_date.strftime("%B %d") - # the url contains +com+ for committee. - if "com" in url: - # click committee dockets (only 1 url). used "for" statement to avoid exception. - for doc_url in page.xpath('//a[contains(@href, "DOC")]/@href'): - doc_page = self.get(doc_url).content - page = lxml.html.fromstring(doc_page) - page.make_links_absolute(url) - # click dockets for the current event date. only 1 url if exists. - for url in page.xpath(f'//a[contains(., "{event_dt}")]/@href'): - event.add_document("Agenda", url, media_type="text/html") - self.scrape_upper_agenda(event, url) - - def scrape_upper_agenda(self, event, url): - # scrape agenda title and bill ids - page = self.get(url).content - page = lxml.html.fromstring(page) - page.make_links_absolute(url) - - title = " ".join( - [sub_title.xpath("string()") for sub_title in page.xpath("//center/b")] - ) - agenda = event.add_agenda_item(title) - for row in page.xpath("//p[./b/b/a/@href]"): - bill = "".join(row.xpath("./b/b/a/text()")[0].replace(".", "").split()) - bill_regex = re.compile(r"(HB|HJ|HR|SB|SJ|SR)[0-9]+") - if bill_regex.match(bill): - agenda.add_bill(bill)
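For reference, the joint-meeting handling added to events.py above splits a combined meeting name into individual committees. A small sketch of that behavior, using the example name from the scraper's own comment:

import re

# Example from the comment in VaEventScraper.scrape():
# "joint meeting of com 1, com2 and com3" becomes ['com 1', 'com2', 'com3']
name = "Joint Meeting of com 1, com2 and com3"
coms = name.replace("Joint Meeting of", "")

committees = []
for com in re.split(r",|and", coms, flags=re.I):
    # the rstrip catches trailing dashes in some source names
    com = com.strip().rstrip("- ")
    if com:
        committees.append(com)

print(committees)  # ['com 1', 'com2', 'com3']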