From cac4fbf3a111efe2f049c9c3f63f68cecdd733c2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 16 Jul 2024 00:51:05 -0400 Subject: [PATCH] more data checks in scrapers --- catalog/sites/bangumi.py | 4 +++- catalog/sites/bgg.py | 4 +++- catalog/sites/douban_book.py | 2 +- catalog/sites/douban_game.py | 4 +++- catalog/sites/douban_movie.py | 4 +++- catalog/sites/douban_music.py | 2 +- catalog/sites/goodreads.py | 4 +++- catalog/sites/rss.py | 6 ++++-- 8 files changed, 21 insertions(+), 9 deletions(-) diff --git a/catalog/sites/bangumi.py b/catalog/sites/bangumi.py index fba90e03..3b348e99 100644 --- a/catalog/sites/bangumi.py +++ b/catalog/sites/bangumi.py @@ -106,7 +106,9 @@ def scrape(self): [title] + (other_title or []) + ([orig_title] if orig_title else []) ) localized_title = [{"lang": detect_language(t), "text": t} for t in titles] - localized_desc = [{"lang": detect_language(brief), "text": brief}] + localized_desc = ( + [{"lang": detect_language(brief), "text": brief}] if brief else [] + ) data = { "localized_title": localized_title, "localized_description": localized_desc, diff --git a/catalog/sites/bgg.py b/catalog/sites/bgg.py index 8abcbc48..9114aa1e 100644 --- a/catalog/sites/bgg.py +++ b/catalog/sites/bgg.py @@ -56,7 +56,9 @@ def scrape(self): pd = ResourceContent( metadata={ "localized_title": localized_title, - "localized_description": [{"lang": "en", "text": brief}], + "localized_description": ( + [{"lang": "en", "text": brief}] if brief else [] + ), "title": title, "other_title": other_title, "genre": category, diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py index 789f4a3e..b007b396 100644 --- a/catalog/sites/douban_book.py +++ b/catalog/sites/douban_book.py @@ -190,7 +190,7 @@ def scrape(self): "subtitle": subtitle, "localized_title": [{"lang": lang, "text": title}], "localized_subtitle": [{"lang": lang, "text": subtitle}], - "localized_description": [{"lang": lang, "text": brief}], + "localized_description": [{"lang": lang, "text": brief}] if brief else [], "orig_title": orig_title, "author": authors, "translator": translators, diff --git a/catalog/sites/douban_game.py b/catalog/sites/douban_game.py index b15f65f0..eb086526 100644 --- a/catalog/sites/douban_game.py +++ b/catalog/sites/douban_game.py @@ -92,7 +92,9 @@ def scrape(self): titles = uniq([title] + other_title + ([orig_title] if orig_title else [])) localized_title = [{"lang": detect_language(t), "text": t} for t in titles] - localized_desc = [{"lang": detect_language(brief), "text": brief}] + localized_desc = ( + [{"lang": detect_language(brief), "text": brief}] if brief else [] + ) pd = ResourceContent( metadata={ diff --git a/catalog/sites/douban_movie.py b/catalog/sites/douban_movie.py index 630b7206..153e76fd 100644 --- a/catalog/sites/douban_movie.py +++ b/catalog/sites/douban_movie.py @@ -212,7 +212,9 @@ def scrape(self): + (other_title if other_title else []) ) localized_title = [{"lang": detect_language(t), "text": t} for t in titles] - localized_desc = [{"lang": detect_language(brief), "text": brief}] + localized_desc = ( + [{"lang": detect_language(brief), "text": brief}] if brief else [] + ) pd = ResourceContent( metadata={ "title": title, diff --git a/catalog/sites/douban_music.py b/catalog/sites/douban_music.py index 43703890..d458ab90 100644 --- a/catalog/sites/douban_music.py +++ b/catalog/sites/douban_music.py @@ -90,7 +90,7 @@ def scrape(self): data = { "title": title, "localized_title": localized_title, - "localized_description": [{"lang": lang, "text": brief}], + "localized_description": [{"lang": lang, "text": brief}] if brief else [], "artist": artist, "genre": genre, "release_date": release_date, diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py index bba40784..17ee841d 100644 --- a/catalog/sites/goodreads.py +++ b/catalog/sites/goodreads.py @@ -73,7 +73,9 @@ def scrape(self, response=None): lang = detect_language(b["title"] + " " + (b["description"] or "")) data["localized_title"] = [{"lang": lang, "text": b["title"]}] data["localized_subtitle"] = [] # Goodreads does not support subtitle - data["localized_description"] = [{"lang": lang, "text": b["description"]}] + data["localized_description"] = ( + [{"lang": lang, "text": b["description"]}] if b["description"] else [] + ) if data["brief"]: data["brief"] = re.sub( diff --git a/catalog/sites/rss.py b/catalog/sites/rss.py index d933d6e1..95d5fcbd 100644 --- a/catalog/sites/rss.py +++ b/catalog/sites/rss.py @@ -88,7 +88,9 @@ def scrape(self): feed = self.parse_feed_from_url(self.url) if not feed: raise ValueError(f"no feed avaialble in {self.url}") - title = feed["title"] + title = feed["title"].strip() + if not title: + raise ParseError(self, "title") desc = html_to_text(feed["description"]) lang = detect_language(title + " " + desc) pd = ResourceContent( @@ -96,7 +98,7 @@ def scrape(self): "title": title, "brief": desc, "localized_title": [{"lang": lang, "text": title}], - "localized_description": [{"lang": lang, "text": desc}], + "localized_description": [{"lang": lang, "text": desc}] if desc else [], "host": ( [feed.get("itunes_author")] if feed.get("itunes_author") else [] ),