Skip to content

Commit

Permalink
Merge pull request #5123 from openstates/mt-events-fix-duplicates
Browse files (browse the repository at this point in the history)
MT: fix import error due to duplicate events
  • Loading branch information
jessemortenson authored Dec 9, 2024
2 parents 16c3085 + 1c6900e commit 9934948
Showing 1 changed file with 45 additions and 15 deletions.
60 changes: 45 additions & 15 deletions scrapers/mt/events.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Union

from openstates.scrape import Scraper, Event
from utils.events import match_coordinates
import datetime
Expand All @@ -10,17 +12,21 @@

class MTEventScraper(Scraper):
_tz = pytz.timezone("America/Denver")
# the same MT event can be listed more than once at the source URLs
# where each listing is an alternate media stream (video vs. audio)
# so we need to do some data combining before yielding
_events = []

def scrape(self):

yield from self.scrape_upcoming()
self.scrape_upcoming()

# scrape events from this month, and last month
today = datetime.date.today()
yield from self.scrape_cal_month(today)
yield from self.scrape_cal_month(
today + dateutil.relativedelta.relativedelta(months=-1)
)
self.scrape_cal_month(today)
self.scrape_cal_month(today + dateutil.relativedelta.relativedelta(months=-1))
for event in self._events:
yield event

def scrape_upcoming(self):
url = "https://sg001-harmony.sliq.net/00309/Harmony/en/View/UpcomingEvents"
Expand All @@ -30,7 +36,7 @@ def scrape_upcoming(self):
page.make_links_absolute(url)

for link in page.xpath("//div[@class='divEvent']/a[1]"):
yield from self.scrape_event(link.xpath("@href")[0])
self.scrape_event(link.xpath("@href")[0])

def scrape_cal_month(self, when: datetime.datetime.date):
date_str = when.strftime("%Y%m01")
Expand All @@ -43,7 +49,7 @@ def scrape_cal_month(self, when: datetime.datetime.date):
if when.date() < datetime.datetime.today().date():
event_id = str(row["Id"])
event_url = f"https://sg001-harmony.sliq.net/00309/Harmony/en/PowerBrowser/PowerBrowserV2/1/-1/{event_id}"
yield from self.scrape_event(event_url)
self.scrape_event(event_url)

def scrape_event(self, url: str):
html = self.get(url).text
Expand All @@ -62,17 +68,25 @@ def scrape_event(self, url: str):
when = dateutil.parser.parse(f"{when_date} {when_time}")
when = self._tz.localize(when)

event = Event(
name=title,
location_name=location,
start_date=when,
classification="committee-meeting",
)
# Check if event already exists in the self._events list
# and if so, add data to that instead of creating duplicate
existing_event = self.check_for_existing_event(title, location, when)
if existing_event is None:
# No existing event found, create one
event = Event(
name=title,
location_name=location,
start_date=when,
classification="committee-meeting",
)
else:
event = existing_event

self.scrape_versions(event, html)
self.scrape_media(event, html)

event.add_source(url)
if existing_event is None:
event.add_source(url)

if "HB" not in title.lower() and "SB" not in title.lower():
event.add_committee(title)
Expand All @@ -84,7 +98,22 @@ def scrape_event(self, url: str):
},
)

yield event
# Make sure we add any new event to the list
if existing_event is None:
self._events.append(event)

def check_for_existing_event(
    self, title: str, location_name: str, start_date: datetime.datetime
) -> Union[Event, None]:
    """Return the already-collected event matching the given identity, if any.

    The same meeting can be listed more than once at the source (one
    listing per media stream), so events are accumulated in
    ``self._events`` and looked up here before a new one is created.

    Args:
        title: Event name to match against ``event.name``.
        location_name: Location to match against ``event.location["name"]``.
        start_date: Start time to match against ``event.start_date``.
            Callers pass the timezone-localized ``datetime`` built while
            parsing the event page (not a bare ``date``).

    Returns:
        The first event in ``self._events`` whose name, location name and
        start date all equal the arguments, or ``None`` when no such
        event has been collected yet.
    """
    for event in self._events:
        # All three fields must agree for a listing to count as a duplicate.
        if (
            event.name == title
            and event.location["name"] == location_name
            and event.start_date == start_date
        ):
            return event

    return None

# versions and media are in the 'dataModel' js variable on the page
def scrape_versions(self, event: Event, html: str):
Expand All @@ -107,4 +136,5 @@ def scrape_media(self, event: Event, html: str):
m["textTags"]["DESCRIPTION"]["text"],
m["textTags"]["URL"]["text"],
media_type="application/vnd",
on_duplicate="ignore", # we are combining links from duplicate "event" listings into one
)

0 comments on commit 9934948

Please sign in to comment.