Skip to content

Commit

Permalink
Merge pull request #4792 from braykuka/1139-mo-scrape-events-from-xml…
Browse files Browse the repository at this point in the history
…-source

MO: scrape events for MO House from XML source
  • Loading branch information
jessemortenson authored Jan 12, 2024
2 parents 5d6b8b8 + 709e3a8 commit 5022048
Showing 1 changed file with 33 additions and 40 deletions.
73 changes: 33 additions & 40 deletions scrapers/mo/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,34 +159,40 @@ def scrape_upper(self):
yield event

def scrape_lower(self):
listing_url = "https://house.mo.gov/HearingsTimeOrder.aspx"
headers = {
"sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
"Referer": "https://documents.house.mo.gov/",
"sec-ch-ua-mobile": "?0",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"sec-ch-ua-platform": '"Windows"',
}
response = self.get(
"https://documents.house.mo.gov/SessionSet.js?v=2", headers=headers
)
session_code_re = re.compile(r"sessionyearcode = '(\d+)'")
session_code = session_code_re.search(response.text).group(1)

html = self.get(listing_url).text
listing_xml_url = (
f"https://documents.house.mo.gov/xml/{session_code}-UpcomingHearingList.XML"
)

# The HTML here isn't wrapped in a container per-event
# which makes xpath a pain. So string split by <hr>
# then parse each event's fragment for cleaner results
events = set()
for fragment in html.split("<hr />")[1:]:
page = lxml.html.fromstring(fragment)
xml_content = self.get(listing_xml_url).text
tree = lxml.html.fromstring(xml_content)

# Skip date header rows
if page.xpath('//div[@id="DateGroup"]'):
continue
else:
for item in self.scrape_lower_item(page):
if item.dedupe_key in events:
self.warning(f"Skipping duplicate event: {item.dedupe_key}")
continue
events.add(item.dedupe_key)
yield item
events = set()
for hearing in tree.xpath("//hearinginfo"):
for item in self.scrape_lower_item(hearing):
if item.dedupe_key in events:
self.warning(f"Skipping duplicate event: {item.dedupe_key}")
continue
events.add(item.dedupe_key)
yield item

def scrape_lower_item(self, page):
# print(lxml.etree.tostring(page, pretty_print=True))
com = self.table_row_content(page, "Committee:")
when_date = self.table_row_content(page, "Date:")
when_time = self.table_row_content(page, "Time:")
location = self.table_row_content(page, "Location:")
com = page.xpath("./committeename/text()")[0]
when_date = page.xpath("./hearingdate/text()")[0]
when_time = page.xpath("./hearingtime/text()")[0]
location = page.xpath("./hearinglocation/text()")[0]

if "house hearing room" in location.lower():
location = f"{location}, 201 W Capitol Ave, Jefferson City, MO 65101"
Expand Down Expand Up @@ -223,25 +229,21 @@ def scrape_lower_item(self, page):

event.add_participant(com, type="committee", note="host")

# different from general MO link xpath due to the <b>
house_link_xpath = (
'.//a[contains(@href, "Bill.aspx") '
'or contains(@href, "bill.aspx")]/b/text()'
)

match_coordinates(
event,
{
"201 W Capitol Ave": (38.57918, -92.17306),
},
)

for bill_title in page.xpath(house_link_xpath):
bill_no = bill_title.split("--")[0].strip()
for bill in page.xpath("./hearingbills/hearingbill"):
bill_title = bill.xpath("./shorttitle/text()")[0]
bill_no = bill.xpath("./currentbillstring/text()")[0]
bill_no = bill_no.replace("HCS", "").strip()

agenda_item = event.add_agenda_item(description=bill_title)
add_bill_to_agenda(agenda_item, bill_no)

yield event

# Given <td><b>header</b> other text</td>,
Expand All @@ -254,12 +256,3 @@ def row_content(self, page, header):
return content[0].strip()
else:
return ""

def table_row_content(self, page, header):
content = page.xpath(
'string(.//tr[td[contains(string(.), "{}")]]/td[2])'.format(header)
)
if len(content) > 0:
return content.strip()
else:
return ""

0 comments on commit 5022048

Please sign in to comment.