diff --git a/catalog/common/sites.py b/catalog/common/sites.py index 1777864b..ac4c8a3d 100644 --- a/catalog/common/sites.py +++ b/catalog/common/sites.py @@ -14,6 +14,7 @@ import django_rq import requests +from validators import url as url_validate from .models import ExternalResource, IdealIdTypes, IdType, Item, SiteName @@ -283,7 +284,9 @@ def get_site_cls_by_id_type(typ: str) -> AbstractSite: @staticmethod def get_site_by_url(url: str) -> AbstractSite | None: - if not url: + if not url or not url_validate( + url, skip_ipv6_addr=True, skip_ipv4_addr=True, may_have_port=False + ): return None cls = next( filter(lambda p: p.validate_url(url), SiteManager.registry.values()), None diff --git a/catalog/sites/goodreads.py b/catalog/sites/goodreads.py index 761ebdda..36e1074e 100644 --- a/catalog/sites/goodreads.py +++ b/catalog/sites/goodreads.py @@ -36,8 +36,8 @@ class Goodreads(AbstractSite): WIKI_PROPERTY_ID = "P2968" DEFAULT_MODEL = Edition URL_PATTERNS = [ - r".+goodreads.com/.*book/show/(\d+)", - r".+goodreads.com/.*book/(\d+)", + r".+goodreads\.com/.*book/show/(\d+)", + r".+goodreads\.com/.*book/(\d+)", ] @classmethod @@ -125,7 +125,7 @@ class Goodreads_Work(AbstractSite): ID_TYPE = IdType.Goodreads_Work WIKI_PROPERTY_ID = "" DEFAULT_MODEL = Work - URL_PATTERNS = [r".+goodreads.com/work/editions/(\d+)"] + URL_PATTERNS = [r".+goodreads\.com/work/editions/(\d+)"] @classmethod def id_to_url(cls, id_value): diff --git a/catalog/tv/tests.py b/catalog/tv/tests.py index 1549c634..35420769 100644 --- a/catalog/tv/tests.py +++ b/catalog/tv/tests.py @@ -158,7 +158,7 @@ def test_miniseries(self): @use_local_response def test_tvspecial(self): url1 = "https://www.themoviedb.org/movie/282758-doctor-who-the-runaway-bride" - url2 = "hhttps://www.imdb.com/title/tt0827573/" + url2 = "https://www.imdb.com/title/tt0827573/" url3 = "https://movie.douban.com/subject/4296866/" p1 = SiteManager.get_site_by_url(url1).get_resource_ready() p2 = SiteManager.get_site_by_url(url2).get_resource_ready() diff --git a/journal/importers/goodreads.py b/journal/importers/goodreads.py index 2bae5c00..98fc7fcf 100644 --- a/journal/importers/goodreads.py +++ b/journal/importers/goodreads.py @@ -12,9 +12,9 @@ from catalog.models import * from journal.models import * -re_list = r"^https://www.goodreads.com/list/show/\d+" -re_shelf = r"^https://www.goodreads.com/review/list/\d+[^?]*\?shelf=[^&]+" -re_profile = r"^https://www.goodreads.com/user/show/(\d+)" +re_list = r"^https://www\.goodreads\.com/list/show/\d+" +re_shelf = r"^https://www\.goodreads\.com/review/list/\d+[^?]*\?shelf=[^&]+" +re_profile = r"^https://www\.goodreads\.com/user/show/(\d+)" gr_rating = { "did not like it": 2, "it was ok": 4, diff --git a/requirements.txt b/requirements.txt index 0c6f7243..7b3c46cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,3 +43,4 @@ setproctitle tqdm typesense urlman +validators