From c7777ffad187039ea4f8a39cc5d837d43828b9fd Mon Sep 17 00:00:00 2001
From: Jesse Mortenson
Date: Mon, 9 Dec 2024 17:39:43 -0600
Subject: [PATCH] MT: events: fix more duplicates and ignore test event

---
 scrapers/mt/events.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/scrapers/mt/events.py b/scrapers/mt/events.py
index ab17a4c611..1fdf406ab3 100644
--- a/scrapers/mt/events.py
+++ b/scrapers/mt/events.py
@@ -56,8 +56,17 @@ def scrape_event(self, url: str):
         page = lxml.html.fromstring(html)
         page.make_links_absolute(url)
 
-        title = page.xpath("//span[@class='headerTitle']")[0].text_content()
-        location = page.xpath("//span[@id='location']")[0].text_content()
+        title = page.xpath("//span[@class='headerTitle']")[0].text_content().strip()
+        location = page.xpath("//span[@id='location']")[0].text_content().strip()
+
+        # handle edge case where event is named simply "Other"
+        # append the location name to force it into not being a duplicate
+        if title.lower() == "other":
+            title = f"{title} - {location}"
+
+        # handle edge case of "test" event, just ignore that
+        if title.lower() == "test":
+            return
 
         if location.lower()[0:4] == "room":
             location = f"{location}, 1301 E 6th Ave, Helena, MT 59601"
@@ -128,6 +137,9 @@ def scrape_versions(self, event: Event, html: str):
                 )
 
     def scrape_media(self, event: Event, html: str):
+        # MT has livestream archives available as m3u8 files
+        # these can be played only by certain players, for example:
+        # https://livepush.io/hlsplayer/index.html
         matches = re.search(r"Media:\s?(.*),", html)
         media = json.loads(matches.group(1))
         if "children" in media and media["children"] is not None: