diff --git a/scrapers/mi/events.py b/scrapers/mi/events.py
index 5f93aa98af..4228518be2 100644
--- a/scrapers/mi/events.py
+++ b/scrapers/mi/events.py
@@ -90,151 +90,9 @@ def scrape_event_page(self, url) -> Generator[Event]:
},
)
- event.dedupe_key = f"{chamber}#{title}#{where}#{when}"
+ event.dedupe_key = f"{chamber}#{title}#{where}#{when}#{status}"
yield event
def table_cell(self, header: str):
xpath = f"//div[@class='formLeft' and contains(text(),'{header}')]/following-sibling::div[@class='formRight']"
return self.current_page.xpath(f"string({xpath})").strip()
-
- # def scrape_event_page(self, url, chamber):
- # html = self.get(url).text
- # html = html.replace("</br>", "<br/>").replace("</BR>", "<br/>")
- # page = lxml.html.fromstring(html)
- # trs = page.xpath("//table[@id='frg_mcommitteemeeting_MeetingTable']/tr")
- # metainf = {}
- # for tr in trs:
- # tds = tr.xpath(".//td")
- # if len(tds) <= 1:
- # continue
- # key = tds[0].text_content().strip()
- # val = tds[1]
- # metainf[key] = {"txt": val.text_content().strip(), "obj": val}
-
- # if metainf == {}:
- # return
-
- # # Wednesday, 5/16/2012 3:00 pm
- # datetime = "%s %s" % (
- # metainf["Date"]["txt"],
- # metainf["Time"]["txt"].replace(".", ""),
- # )
-
- # status = "tentative"
- # if "Cancelled" in datetime:
- # status = "cancelled"
-
- # translate = {
- # "noon": " PM",
- # "a.m.": " AM",
- # "am": " AM", # This is due to a nasty line they had.
- # "a.m": "AM", # another weird one
- # }
-
- # for t in translate:
- # if t in datetime:
- # datetime = datetime.replace(t, translate[t])
-
- # datetime = re.sub(r"\s+", " ", datetime)
-
- # for text_to_remove in [
- # "or after committees are given leave",
- # "or later immediately after committees are given leave",
- # "or later after committees are given leave by the House to meet",
- # "**Please note time**",
- # "Cancelled",
- # ]:
- # datetime = datetime.replace(text_to_remove, "").strip()
-
- # datetime = datetime.replace("p.m.", "pm")
- # datetime = datetime.replace("Noon", "pm")
- # try:
- # datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I:%M %p")
- # except ValueError:
- # datetime = dt.datetime.strptime(datetime, "%A, %m/%d/%Y %I %p")
- # where = metainf["Location"]["txt"]
- # title = metainf["Committee(s)"]["txt"] # XXX: Find a better title
-
- # if chamber == "other":
- # chamber = "joint"
-
- # event = Event(
- # name=title,
- # start_date=self._tz.localize(datetime),
- # location_name=where,
- # status=status,
- # )
- # event.dedupe_key = f"{chamber}#{title}#{where}#{self._tz.localize(datetime)}"
- # event.add_source(url)
- # event.add_source(mi_events)
-
- # chair_name = metainf["Chair"]["txt"].strip()
- # if chair_name:
- # event.add_participant(chair_name, type="legislator", note="chair")
- # else:
- # self.warning("No chair found for event '{}'".format(title))
-
- # event.add_participant(
- # metainf["Committee(s)"]["txt"], type="committee", note="host"
- # )
-
- # # The MI pages often contain broken markup for line breaks in the agenda
- # # like </br>. This gets stripped in text_content and we lose the information
- # # needed to separate out agenda sections.
- # # So instead, pull out the raw HTML, break it, then parse it.
- # agenda = page.xpath("//td[contains(., 'Agenda')]/following-sibling::td")[0]
- # agenda_html = lxml.etree.tostring(agenda, encoding="unicode")
- # agenda_parts = re.split(r"\<br\>\<br\>", agenda_html, flags=re.IGNORECASE)
- # for part_html in agenda_parts:
- # if part_html == "":
- # continue
- # part = lxml.html.fromstring(part_html)
- # part_text = part.text_content().strip()
- # if part_text == "":
- # continue
- # item = event.add_agenda_item(part_text)
-
- # related_bills = part.xpath("//a[contains(@href, 'getObject')]")
- # for bill in related_bills:
- # item.add_bill(bill.text_content())
-
- # yield event
-
- # def scrape(self, chamber=None):
- # chambers = [chamber] if chamber is not None else ["upper", "lower", "other"]
- # html = self.get(mi_events).text
- # page = lxml.html.fromstring(html)
- # page.make_links_absolute(mi_events)
- # xpaths = {
- # "lower": "//span[@id='frg_mcommitteemeetings_HouseMeetingsList']",
- # "upper": "//span[@id='frg_mcommitteemeetings_SenateMeetingsList']",
- # "other": "//span[@is='frg_mcommitteemeetings_JointMeetingsList']",
- # }
-
- # if page.xpath(
- # "//span[contains(text(),'There are no House meetings scheduled')]"
- # ) and page.xpath(
- # "//span[contains(text(),'There are no Senate meetings scheduled')]"
- # ):
- # raise EmptyScrape
- # return
-
- # event_objects = set()
- # for chamber in chambers:
- # span = page.xpath(xpaths[chamber])
- # if len(span) > 0:
- # span = span[0]
- # else:
- # continue
- # events = span.xpath(".//a[contains(@href, 'committeemeeting')]")
- # for event in events:
- # url = event.attrib["href"]
- # if "doPostBack" in url:
- # continue
- # for event in self.scrape_event_page(url, chamber):
- # event_name = event.dedupe_key
- # if event_name in event_objects:
- # self.warning(f"Skipping duplicate event: {event_name}")
- # continue
- # event_objects.add(event_name)
- # yield event
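
Why the dedupe_key change above matters: the removed scrape() loop skipped any event whose dedupe_key had already been seen, so with status left out of the key a meeting first scraped as tentative and later cancelled would collide with its earlier self and the cancellation would be dropped. A minimal sketch of the new behavior, assuming the surrounding pipeline still skips repeated keys the way the old loop did (the emit() helper and the sample values are illustrative, not part of the scraper):

    seen = set()

    def emit(chamber: str, title: str, where: str, when: str, status: str):
        # Same shape as the new key: chamber#title#where#when#status
        key = f"{chamber}#{title}#{where}#{when}#{status}"
        if key in seen:
            return None  # duplicate: same meeting in the same state
        seen.add(key)
        return key

    emit("lower", "Appropriations", "Room 352", "2024-05-16 15:00", "tentative")
    # A re-scrape after cancellation now yields a distinct key, so the
    # cancelled version is emitted instead of being deduped away:
    emit("lower", "Appropriations", "Room 352", "2024-05-16 15:00", "cancelled")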
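The table_cell() helper that survives the rewrite leans on XPath's string() function, which evaluates to the string value of the first node matched by the inner expression (and to the empty string when nothing matches), so the caller gets plain text back rather than a node list to index into. A self-contained sketch against made-up markup in the same formLeft/formRight shape the helper targets:

    import lxml.html

    page = lxml.html.fromstring(
        "<html><body>"
        "<div class='formLeft'>Location:</div>"
        "<div class='formRight'> Room 352, House Office Building </div>"
        "</body></html>"
    )
    xpath = (
        "//div[@class='formLeft' and contains(text(),'Location')]"
        "/following-sibling::div[@class='formRight']"
    )
    # string(...) collapses the match to its text; .strip() trims the
    # surrounding whitespace, exactly as table_cell() does.
    print(page.xpath(f"string({xpath})").strip())
    # -> Room 352, House Office Building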