diff --git a/nichtparasoup/_internals/__init__.py b/nichtparasoup/_internals/__init__.py index 5429e116..8cdc130e 100644 --- a/nichtparasoup/_internals/__init__.py +++ b/nichtparasoup/_internals/__init__.py @@ -20,14 +20,14 @@ _logger = logging.getLogger('nichtparasoup') -_LOG_TYPE = Literal['debug', 'info', 'warning', 'error', 'critical', 'log', 'exception'] +_LOG_LEVEL = Literal['debug', 'info', 'warning', 'error', 'critical', 'log', 'exception'] -def _log(type: _LOG_TYPE, message: str, *args: Any, **kwargs: Any) -> None: +def _log(level: _LOG_LEVEL, message: str, *args: Any, **kwargs: Any) -> None: if not logging.root.handlers and _logger.level == logging.NOTSET: _logger.setLevel(logging.INFO) _logger.addHandler(logging.StreamHandler()) - getattr(_logger, type)(message.rstrip(), *args, **kwargs) + getattr(_logger, level)(message.rstrip(), *args, **kwargs) def _message(message: str, color: Optional[str] = None, file: Optional[TextIO] = None) -> None: diff --git a/nichtparasoup/core/image.py b/nichtparasoup/core/image.py index d6f0b45a..c98a7949 100644 --- a/nichtparasoup/core/image.py +++ b/nichtparasoup/core/image.py @@ -1,17 +1,55 @@ -__all__ = ["Image", "ImageCollection", "ImageSource", "ImageUri"] +__all__ = ["Image", "ImageCollection"] -from typing import Any, Optional, Set +from typing import Any, Set from uuid import uuid4 ImageUri = str -ImageSource = str +SourceUri = str class Image(object): + """Describe an image - def __init__(self, uri: ImageUri, - is_generic: bool = False, source: Optional[ImageSource] = None, + `uri` + The absolute URI of the image. This basically identifies the Image and makes it unique. + + This absolute URI must include: ``scheme``, ``host``. + ``scheme`` must be either 'http' or 'https' - the last one is preferred. + Optional are: ``port``, ``path``, ``query``, ``fragment``. + + `source` + The URI where the image was originally found.
+ This URI can point to an ImageBoardThread, or a comment section in a Forum, or a news article... + + In the idea of fair use, it is encouraged to point to the source as well as possible. + + This absolute URI must include: ``scheme``, ``host``. + ``scheme`` must be either 'http' or 'https' - the last one is preferred. + Optional are: ``port``, ``path``, ``query``, ``fragment``. + + Good examples are: + * https://www.reddit.com/r/Awww/comments/e1er0c/say_hi_to_loki_hes_just_contemplating/ + * https://giphy.com/gifs/10kABVanhwykJW + + `is_generic` + If a generic image crawler is used, it's common that each image URI looks exactly the same. + To make this known, use this flag. + + `more` + A dictionary of additional information an image crawler might want to deliver. + + This dictionary's data types are intended to be the basic ones: string, int, float, list, set, dict, bool, None + + Good examples are: + * image-dimensions + * author, copyright information + * valid-until + + """ + + def __init__(self, uri: ImageUri, source: SourceUri, + is_generic: bool = False, **more: Any) -> None: # pragma: no cover self.uri = uri self.source = source diff --git a/nichtparasoup/core/imagecrawler.py b/nichtparasoup/core/imagecrawler.py index 6211fa7b..8ddfb793 100644 --- a/nichtparasoup/core/imagecrawler.py +++ b/nichtparasoup/core/imagecrawler.py @@ -1,10 +1,11 @@ -__all__ = ["ImageCrawlerConfig", "BaseImageCrawler", "ImageCrawlerInfo"] +__all__ = ["ImageCrawlerConfig", "BaseImageCrawler", "ImageCrawlerInfo", "RemoteFetcher", "ImageRecognizer"] from abc import ABC, abstractmethod from http.client import HTTPResponse from re import IGNORECASE as RE_IGNORECASE, compile as re_compile from threading import Lock from typing import Any, Dict, Optional, Pattern, Tuple +from urllib.parse import urlparse from urllib.request import Request, urlopen from nichtparasoup._internals import _log @@ -28,11 +29,13 @@ class ImageCrawlerConfig(Dict[_ImageCrawlerConfigKey, Any]): class 
BaseImageCrawler(ABC): def __init__(self, **config: Any) -> None: # pragma: no cover - self._config = self.check_config(config) + self._config = self.check_config(config) # intended to be immutable from now on self._reset_before_next_crawl = True self._crawl_lock = Lock() - _log('debug', 'crawler initialized {}({:x}) with: {!r}'.format( - type(self).__name__, id(self), self.get_config())) + _log('debug', 'crawler initialized: {!r}'.format(self)) + + def __repr__(self) -> str: + return '<{0.__module__}.{0.__name__} {1!r}>'.format(type(self), self.get_config()) def __eq__(self, other: Any) -> bool: if type(self) is type(other): @@ -41,90 +44,130 @@ def __eq__(self, other: Any) -> bool: return False def get_config(self) -> ImageCrawlerConfig: - return ImageCrawlerConfig(self._config) # is just a shallow copy + """ + Get all *public* information from the config + + For internal access to the config using `self._config` is encouraged + """ + return ImageCrawlerConfig({k: v for (k, v) in self._config.items() if not k.startswith('_')}) def reset(self) -> None: self._reset_before_next_crawl = True - _log('debug', 'crawler reset planned {}({:x})'.format(type(self).__name__, id(self))) + _log('debug', 'crawler reset planned for {!r}'.format(self)) def crawl(self) -> ImageCollection: # pragma: no cover - debug_map = dict(type=type(self).__name__, id=id(self)) with self._crawl_lock: try: if self._reset_before_next_crawl: - _log('debug', 'crawler resetting {type}({id:x})'.format_map(debug_map)) + _log('debug', 'crawler resetting {!r}'.format(self)) self._reset() self._reset_before_next_crawl = False - _log('debug', 'crawling started {type}({id:x})'.format_map(debug_map)) + _log('debug', 'crawling started {!r}'.format(self)) crawled = self._crawl() - _log('debug', 'crawling finished {type}({id:x})'.format_map(debug_map)) + _log('debug', 'crawling finished {!r}'.format(self)) return crawled except Exception: - _log('exception', 'caught an error during crawling 
{type}({id:x})'.format_map(debug_map)) + _log('exception', 'caught an error during crawling {!r}'.format(self)) return ImageCollection() - _RE_IMAGE_PATH = re_compile(r'.*\.(?:jpeg|jpg|png|gif)(?:[?#].*)?$', flags=RE_IGNORECASE) # type: Pattern[str] - - @classmethod - def path_is_image(cls, uri: str) -> bool: - return cls._RE_IMAGE_PATH.match(uri) is not None - - _HEADERS_DEFAULT = { - 'User-Agent': 'NichtParasoup', - } - @classmethod - def fetch_remote_data(cls, uri: str, - timeout: float = 10.0, - headers: Optional[Dict[str, str]] = None) -> Tuple[str, str]: - _log('debug', 'fetch remote {!r} in {!r} with {!r}'.format(uri, timeout, headers)) - request = Request(uri, headers={**cls._HEADERS_DEFAULT, **(headers or dict())}) - response = urlopen(request, timeout=timeout) # type: HTTPResponse - actual_uri = response.geturl() # after following redirects ... - charset = str(response.info().get_param('charset', 'UTF-8')) - return response.read().decode(charset), actual_uri - - @staticmethod @abstractmethod - def info() -> ImageCrawlerInfo: # pragma: no cover - return ImageCrawlerInfo( - desc="Some textual description about what this ImageCrawler does.", - config=dict( - # leave the dict empty, if there is nothing to configure - param1="meaning of param1", - paramN="meaning of paramN", - ), - version='0.0.dev1', - ) + def info(cls) -> ImageCrawlerInfo: # pragma: no cover + """ + Get info of the crawler + + example implementation: + return ImageCrawlerInfo( + desc="Some textual description about what this ImageCrawler does.", + config=dict( + # leave the dict empty, if there is nothing to configure + param1="meaning of param1", + paramN="meaning of paramN", + ), + version='0.0.dev1', + ) + """ + raise NotImplementedError() - @staticmethod + @classmethod @abstractmethod - def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig: # pragma: no cover + def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig: # pragma: no cover """ - this function is intended 
to check if a config is valid and to strip unused config. + This function is intended to check if a config is valid and to strip unused config. - when implementing: - check if any config is viable. if not raise ValueError or TypeError or KeyError or whatever Error - return the viable config for this crawler instance + When implementing: + Check if any config is viable. if not raise ValueError or TypeError or KeyError + or whatever Error. + Return the viable config for this crawler instance. - example: + Example implementation: height = config["height"] # will raise KeyError automatically if type(height) is not int: raise TypeError("height {} is not int".format(height)) if height <= 0: raise ValueError("height {} <= 0".format(width)) """ - return ImageCrawlerConfig(config) + raise NotImplementedError() @abstractmethod def _reset(self) -> None: # pragma: no cover """ - this function is intended to reset the crawler to restart at front + This function is intended to reset the crawler to restart at front """ + raise NotImplementedError() @abstractmethod def _crawl(self) -> ImageCollection: # pragma: no cover """ - this function is intended to find and fetch ImageURIs + This function is intended to find and fetch ImageURIs """ - return ImageCollection() + raise NotImplementedError() + + +class RemoteFetcher(object): + + _HEADERS_DEFAULT = { + 'User-Agent': 'NichtParasoup', + } + + def __init__(self, timeout: float = 10.0, headers: Optional[Dict[str, str]] = None) -> None: # pragma: no cover + self._timeout = timeout + self._headers = self.__class__._HEADERS_DEFAULT.copy() + if headers: + self._headers.update(headers) + + @staticmethod + def _valid_uri(uri: str) -> bool: + (scheme, _, _, _, _, _) = urlparse(uri) + return scheme in {'http', 'https'} + + def get_stream(self, uri: str) -> Tuple[HTTPResponse, str]: + if not self._valid_uri(uri): + raise ValueError('not remote: ' + uri) + _log('debug', 'fetch remote {!r} in {}s with {!r}'.format( + uri, self._timeout, 
self._headers)) + request = Request(uri, headers=self._headers) + try: + response = urlopen(request, timeout=self._timeout) # type: HTTPResponse + except BaseException as e: + _log('debug', 'caught error on fetch remote {!r}'.format(uri), exc_info=True) + raise e + actual_uri = response.geturl() # after following redirects ... + return response, actual_uri + + def get_bytes(self, uri: str) -> Tuple[bytes, str]: + response, actual_uri = self.get_stream(uri) + return response.read(), actual_uri + + def get_string(self, uri: str, charset_fallback: str = 'UTF-8') -> Tuple[str, str]: + response, actual_uri = self.get_stream(uri) + charset = str(response.info().get_param('charset', charset_fallback)) + return response.read().decode(charset), actual_uri + + +class ImageRecognizer(object): + + _PATH_RE = re_compile(r'.+\.(?:jpeg|jpg|png|gif|svg)(?:[?#].*)?$', flags=RE_IGNORECASE) # type: Pattern[str] + + def path_is_image(self, uri: str) -> bool: + return self._PATH_RE.match(uri) is not None diff --git a/nichtparasoup/core/server.py b/nichtparasoup/core/server.py index 44fe0c5e..81d8682e 100644 --- a/nichtparasoup/core/server.py +++ b/nichtparasoup/core/server.py @@ -55,9 +55,7 @@ def get_image(self) -> Optional[Dict[str, Any]]: def _log_refill_crawler(crawler: Crawler, refilled: int) -> None: # must be compatible to nichtparasoup.core._OnFill if refilled > 0: - _log('info', "refilled via {}({:x}) by {}".format( - type(crawler.imagecrawler).__name__, id(crawler.imagecrawler), - refilled)) + _log('info', "refilled by {} via {!r}".format(refilled, crawler.imagecrawler)) def refill(self) -> Dict[str, bool]: with self._locks.refill: diff --git a/nichtparasoup/imagecrawler/dummy.py b/nichtparasoup/imagecrawler/dummy.py index ac62fd1c..e39b8d27 100644 --- a/nichtparasoup/imagecrawler/dummy.py +++ b/nichtparasoup/imagecrawler/dummy.py @@ -8,8 +8,8 @@ class Dummy(BaseImageCrawler): - @staticmethod - def info() -> ImageCrawlerInfo: + @classmethod + def info(cls) -> 
ImageCrawlerInfo: from nichtparasoup import __version__ return ImageCrawlerInfo( desc='"Finds" the same image ... again ... and again.', @@ -19,8 +19,8 @@ def info() -> ImageCrawlerInfo: version=__version__, ) - @staticmethod - def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig: + @classmethod + def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig: image_uri = config["image_uri"] if type(image_uri) is not str: raise TypeError("image_uri {!r} is not str".format(image_uri)) @@ -37,7 +37,7 @@ def _crawl(self) -> ImageCollection: images = ImageCollection() config = self.get_config() images.add(Image( - config["image_uri"], + config["image_uri"], config["image_uri"], is_generic=True, this_is_a_dummy=True, )) diff --git a/nichtparasoup/imagecrawler/picsum.py b/nichtparasoup/imagecrawler/picsum.py index 9cfb892f..6951d852 100644 --- a/nichtparasoup/imagecrawler/picsum.py +++ b/nichtparasoup/imagecrawler/picsum.py @@ -1,17 +1,16 @@ from typing import Any, Dict -from nichtparasoup.core.image import Image, ImageCollection, ImageUri +from nichtparasoup.core.image import Image, ImageCollection from nichtparasoup.core.imagecrawler import BaseImageCrawler, ImageCrawlerConfig, ImageCrawlerInfo __all__ = ["Picsum"] class Picsum(BaseImageCrawler): - _bunch = 10 - @staticmethod - def info() -> ImageCrawlerInfo: + @classmethod + def info(cls) -> ImageCrawlerInfo: from nichtparasoup import __version__ return ImageCrawlerInfo( desc='Find images from https://picsum.photos', @@ -22,8 +21,8 @@ def info() -> ImageCrawlerInfo: version=__version__, ) - @staticmethod - def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig: + @classmethod + def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig: width = config["width"] height = config["height"] if type(width) is not int: @@ -40,7 +39,7 @@ def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig: ) @staticmethod - def _get_image_uri(width: int, height: int) -> ImageUri: + def 
_get_image_uri(width: int, height: int) -> str: return "https://picsum.photos/{}/{}".format(width, height) def _reset(self) -> None: # pragma: no cover @@ -50,8 +49,9 @@ def _crawl(self) -> ImageCollection: images = ImageCollection() config = self.get_config() for _ in range(0, self._bunch): + uri = self._get_image_uri(**config) images.add(Image( - self._get_image_uri(**config), + uri, uri, is_generic=True, )) return images diff --git a/nichtparasoup/imagecrawler/reddit.py b/nichtparasoup/imagecrawler/reddit.py index 47b1cae7..7e0d4a64 100644 --- a/nichtparasoup/imagecrawler/reddit.py +++ b/nichtparasoup/imagecrawler/reddit.py @@ -5,7 +5,9 @@ from urllib.parse import quote_plus as url_quote, urljoin from nichtparasoup.core.image import Image, ImageCollection -from nichtparasoup.core.imagecrawler import BaseImageCrawler, ImageCrawlerConfig, ImageCrawlerInfo +from nichtparasoup.core.imagecrawler import ( + BaseImageCrawler, ImageCrawlerConfig, ImageCrawlerInfo, ImageRecognizer, RemoteFetcher, +) class Reddit(BaseImageCrawler): @@ -13,11 +15,13 @@ class Reddit(BaseImageCrawler): def __init__(self, **config: Any) -> None: # pragma: no cover super().__init__(**config) self._uri_base = 'https://www.reddit.com/r/{}.json?after='.format( - url_quote(self.get_config()['subreddit'])) + url_quote(self._config['subreddit'])) self._after = None # type: Optional[str] + self._remote_fetcher = RemoteFetcher() + self._image_recognizer = ImageRecognizer() - @staticmethod - def info() -> ImageCrawlerInfo: + @classmethod + def info(cls) -> ImageCrawlerInfo: from nichtparasoup import __version__ return ImageCrawlerInfo( desc='A Crawler for an arbitrary SubReddit of https://www.reddit.com/', @@ -27,8 +31,8 @@ def info() -> ImageCrawlerInfo: version=__version__, ) - @staticmethod - def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig: + @classmethod + def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig: subreddit = config["subreddit"] if type(subreddit) is not 
str: raise TypeError("subreddit {!r} is not str".format(subreddit)) @@ -44,9 +48,9 @@ def _reset(self) -> None: def _crawl(self) -> ImageCollection: images = ImageCollection() - listing_data, uri = self.fetch_remote_data(self._get_uri()) - listing = json_loads(listing_data) - del listing_data # free up some ram + listing_string, uri = self._remote_fetcher.get_string(self._get_uri(self._after)) + listing = json_loads(listing_string) + del listing_string # free up some ram for child in listing['data']['children']: image = self._get_image(child['data']) if image: @@ -58,12 +62,9 @@ def _crawl(self) -> ImageCollection: self._after = listing['data']['after'] return images - def _get_uri(self) -> str: - return self._uri_base + (url_quote(self._after) if self._after else '') + def _get_uri(self, after: Optional[str]) -> str: + return self._uri_base + (url_quote(after) if after else '') def _get_image(self, data: Dict[str, Any]) -> Optional[str]: uri = data.get('url') # type: Optional[str] - if uri: - if self.path_is_image(uri): - return uri - return None + return uri if uri and self._image_recognizer.path_is_image(uri) else None diff --git a/setup.py b/setup.py index e674e09d..4430ce32 100755 --- a/setup.py +++ b/setup.py @@ -65,6 +65,8 @@ "coverage", "pytest", "ddt", + # "flake8-builtins", # nice in general, but seems not bug-free, yet. + # "flake8-docstrings", not in use, until pluggable ImageCrawlers are implemented. 
], ) diff --git a/testing/test_core/mockable_crawler.py b/testing/test_core/mockable_crawler.py index abb9208d..dc1621c1 100644 --- a/testing/test_core/mockable_crawler.py +++ b/testing/test_core/mockable_crawler.py @@ -33,5 +33,5 @@ class Random3Crawler(_LoggingCrawler): def crawl(self) -> int: super().crawl() for _ in range(3): - self.images.add(Image('test', is_generic=True)) + self.images.add(Image('test', 'test', is_generic=True)) return 3 diff --git a/testing/test_core/test_baseimagecrawler.py b/testing/test_core/test_baseimagecrawler.py index f92ff4eb..c99e371a 100644 --- a/testing/test_core/test_baseimagecrawler.py +++ b/testing/test_core/test_baseimagecrawler.py @@ -1,7 +1,5 @@ import unittest -from nichtparasoup.core.imagecrawler import BaseImageCrawler - from .mockable_imagecrawler import MockableImageCrawler, YetAnotherImageCrawler @@ -62,15 +60,36 @@ def test_reset_released(self) -> None: self.assertFalse(c._reset_before_next_crawl) -class BaseImageCrawlerPathIsImageTest(unittest.TestCase): +class BaseImageCrawlerGetConfigTest(unittest.TestCase): + + def test_public(self) -> None: + # arrange + c = MockableImageCrawler(foo='bar') + # act + config = c.get_config() + # assert + self.assertDictEqual(dict(foo='bar'), config) - def test_path_is_image(self) -> None: + def test_empty_key(self) -> None: # arrange - image_file_extensions = ('jpg', 'jpeg', 'gif', 'png') - # act & assert - for image_file_extension in image_file_extensions: - image_file_path = '.' 
+ image_file_extension - self.assertTrue(BaseImageCrawler.path_is_image(image_file_path)) - self.assertTrue(BaseImageCrawler.path_is_image(image_file_path + '?foo')) - self.assertTrue(BaseImageCrawler.path_is_image(image_file_path + '#bar')) - self.assertTrue(BaseImageCrawler.path_is_image(image_file_path + '?foo#bar')) + c = MockableImageCrawler(**{'': 'bar'}) + # act + config = c.get_config() + # assert + self.assertDictEqual({'': 'bar'}, config) + + def test_protected(self) -> None: + # arrange + c = MockableImageCrawler(_foo='bar') + # act + config = c.get_config() + # assert + self.assertDictEqual(dict(), config) + + def test_private(self) -> None: + # arrange + c = MockableImageCrawler(__foo='bar') + # act + config = c.get_config() + # assert + self.assertDictEqual(dict(), config) diff --git a/testing/test_core/test_crawler.py b/testing/test_core/test_crawler.py index 0716a6b6..9aec2242 100644 --- a/testing/test_core/test_crawler.py +++ b/testing/test_core/test_crawler.py @@ -150,7 +150,7 @@ def test_function(self) -> None: class CrawlerCrawlCase(unittest.TestCase): def setUp(self) -> None: - self.images = ImageCollection({Image('1'), Image('2')}) + self.images = ImageCollection({Image('1', 'test'), Image('2', 'test')}) self.imagecrawler = MockableImageCrawler() self.imagecrawler.crawl = MagicMock(return_value=self.images) # type: ignore self.crawler = Crawler(self.imagecrawler, 1) diff --git a/testing/test_core/test_image.py b/testing/test_core/test_image.py index 0fa9b962..35763310 100644 --- a/testing/test_core/test_image.py +++ b/testing/test_core/test_image.py @@ -8,21 +8,21 @@ class ImageTest(unittest.TestCase): def test_uri_is_hash(self) -> None: # arrange uri = 'test' - image = Image(uri) + image = Image(uri, 'test') # assert self.assertEqual(hash(uri), hash(image)) def test_uri_makes_equal(self) -> None: # arrange uri = 'test' - image1 = Image(uri) - image2 = Image(uri) + image1 = Image(uri, 'test1') + image2 = Image(uri, 'test2') # assert 
self.assertEqual(image1, image2) def test_unequal_other_types(self) -> None: # arrange - image = Image("testA") + image = Image("testA", 'test') other_types = [None, True, 23, 4.2, "", [], (), {}, self] # type: ignore # assert for other_type in other_types: @@ -30,10 +30,10 @@ def test_unequal_other_types(self) -> None: def test_equal(self) -> None: # arrange - image1 = Image("testA") - image2 = Image("testA") - image3 = Image("testB", is_generic=True) - image4 = Image("testB", is_generic=True) + image1 = Image("testA", 'testA') + image2 = Image("testA", 'testA') + image3 = Image("testB", 'testB', is_generic=True) + image4 = Image("testB", 'testB', is_generic=True) # assert self.assertEqual(image1, image1) self.assertEqual(image1, image2) @@ -42,8 +42,8 @@ def test_equal(self) -> None: def test_remove_nongeneric_from_container(self) -> None: # arrange - image1 = Image("testA", is_generic=True) - image2 = Image("testA", is_generic=True) + image1 = Image("testA", 'testA', is_generic=True) + image2 = Image("testA", 'testA', is_generic=True) images = ImageCollection() images.add(image1) # act diff --git a/testing/test_core/test_imagerecognizer.py b/testing/test_core/test_imagerecognizer.py new file mode 100644 index 00000000..178a7a6b --- /dev/null +++ b/testing/test_core/test_imagerecognizer.py @@ -0,0 +1,30 @@ +import unittest + +from nichtparasoup.core.imagecrawler import ImageRecognizer + + +class BaseImageCrawlerPathIsImageTest(unittest.TestCase): + + def test_path_is_image(self) -> None: + # arrange + recognizer = ImageRecognizer() + image_file_extensions = ('jpg', 'jpeg', 'gif', 'png', 'svg') + # act & assert + for image_file_extension in image_file_extensions: + image_file_path = 'foo.' 
+ image_file_extension + self.assertTrue(recognizer.path_is_image(image_file_path), image_file_path) + self.assertTrue(recognizer.path_is_image(image_file_path + '?foo'), image_file_path) + self.assertTrue(recognizer.path_is_image(image_file_path + '#bar'), image_file_path) + self.assertTrue(recognizer.path_is_image(image_file_path + '?foo#bar'), image_file_path) + + def test_path_is_not_image(self) -> None: + # arrange + recognizer = ImageRecognizer() + not_image_file_extensions = ('', '/', '.html', '.js', '.css') + # act & assert + for not_image_file_extension in not_image_file_extensions: + not_image_file_path = 'foo' + not_image_file_extension + self.assertFalse(recognizer.path_is_image(not_image_file_path), not_image_file_path) + self.assertFalse(recognizer.path_is_image(not_image_file_path + '?foo'), not_image_file_path) + self.assertFalse(recognizer.path_is_image(not_image_file_path + '#bar'), not_image_file_path) + self.assertFalse(recognizer.path_is_image(not_image_file_path + '?foo#bar'), not_image_file_path) diff --git a/testing/test_core/test_npcore.py b/testing/test_core/test_npcore.py index 35c56b32..1e8c1654 100644 --- a/testing/test_core/test_npcore.py +++ b/testing/test_core/test_npcore.py @@ -10,8 +10,8 @@ class NPCoreTest(unittest.TestCase): def test__is_image_not_in_blacklist(self) -> None: # arrange - image1 = Image("test1") - image2 = Image("test2") + image1 = Image("test1", 'test') + image2 = Image("test2", 'test') core = NPCore() # act core.blacklist.add(image1.uri) @@ -21,8 +21,8 @@ def test__is_image_not_in_blacklist(self) -> None: def test__add_image_to_blacklist(self) -> None: # arrange - image1 = Image("test1") - image2 = Image("test2") + image1 = Image("test1", 'test') + image2 = Image("test2", 'test') core = NPCore() # act core._add_image_to_blacklist(image1) diff --git a/testing/test_imagecrawler/__init__.py b/testing/test_imagecrawler/__init__.py new file mode 100644 index 00000000..5a33673b --- /dev/null +++ 
b/testing/test_imagecrawler/__init__.py @@ -0,0 +1,47 @@ +from collections import OrderedDict +from http.client import HTTPResponse +from os.path import join as path_join +from typing import Dict, Optional, Tuple +from urllib.parse import parse_qs, urlencode, urlparse, urlunparse + +from nichtparasoup.core.imagecrawler import RemoteFetcher + + +class _FileFetcher(RemoteFetcher): + + def __init__(self, known_files: Dict[str, str], base_dir: Optional[str] = None) -> None: + super().__init__() + self._known_files = {self.__class__._uri_sort_query(k): v for k, v in known_files.items()} + self._dir = base_dir + + @classmethod + def _uri_sort_query(cls, uri: str) -> str: + scheme, netloc, path, params, query, fragment = urlparse(uri) + if query == '': + query_sorted = query + else: + query_dict = parse_qs(query, keep_blank_values=True) + query_dict_sorted = OrderedDict((k, query_dict[k]) for k in sorted(query_dict)) + query_sorted = urlencode(query_dict_sorted, doseq=True) + uri_sorted = urlunparse((scheme, netloc, path, params, query_sorted, fragment)) + return uri_sorted + + def _get_file_uri(self, uri: str) -> str: + _, _, url, params, query, fragment = urlparse(uri) + uri_abs = urlunparse(('', '', url, params, query, fragment)) + uri_sorted = self.__class__._uri_sort_query(uri_abs) + file_known = self._known_files.get(uri_sorted) + if not file_known: + raise FileNotFoundError('uri unexpected: {}'.format(uri_sorted)) + if self._dir: + file_known = path_join(self._dir, file_known) + return 'file://' + file_known + + @staticmethod + def _valid_uri(uri: str) -> bool: + scheme, _, _, _, _, _ = urlparse(uri) + return scheme == 'file' + + def get_stream(self, uri: str) -> Tuple[HTTPResponse, str]: + stream, _ = super().get_stream(self._get_file_uri(uri)) + return stream, uri diff --git a/testing/test_imagecrawler/test__filefetcher.py b/testing/test_imagecrawler/test__filefetcher.py new file mode 100644 index 00000000..03e82190 --- /dev/null +++ 
b/testing/test_imagecrawler/test__filefetcher.py @@ -0,0 +1,54 @@ +import unittest + +from . import _FileFetcher + + +class FileFetcherTest(unittest.TestCase): + + def test__uri_sort_query__dings(self) -> None: + self.assertEqual( + 'https://asdf', + _FileFetcher._uri_sort_query('https://asdf') + ) + + def test__uri_sort_query__path(self) -> None: + self.assertEqual( + 'https://as/df', + _FileFetcher._uri_sort_query('https://as/df') + ) + + def test__uri_sort_query__fragment(self) -> None: + self.assertEqual( + 'https://asdf#foo', + _FileFetcher._uri_sort_query('https://asdf#foo') + ) + + def test__uri_sort_query__no_query(self) -> None: + self.assertEqual( + 'https://asdf', + _FileFetcher._uri_sort_query('https://asdf?') + ) + + def test__uri_sort_query__empty_query(self) -> None: + self.assertEqual( + 'https://asdf?foo=', + _FileFetcher._uri_sort_query('https://asdf?foo=') + ) + + def test__uri_sort_query__query(self) -> None: + self.assertEqual( + 'https://asdf?foo=1', + _FileFetcher._uri_sort_query('https://asdf?foo=1') + ) + + def test__uri_sort_query__query_sorted(self) -> None: + self.assertEqual( + 'https://asdf?bar=1&foo=2', + _FileFetcher._uri_sort_query('https://asdf?bar=1&foo=2') + ) + + def test__uri_sort_query__unsorted(self) -> None: + self.assertEqual( + 'https://asdf?bar=2&foo=1', + _FileFetcher._uri_sort_query('https://asdf?foo=1&bar=2') + ) diff --git a/testing/test_imagecrawler/test_reddit.py b/testing/test_imagecrawler/test_reddit.py index 50c390c2..aff1958a 100644 --- a/testing/test_imagecrawler/test_reddit.py +++ b/testing/test_imagecrawler/test_reddit.py @@ -1,10 +1,12 @@ import unittest -from typing import Any, Tuple +from os.path import dirname, join as path_join from nichtparasoup.core.image import Image, ImageCollection from nichtparasoup.imagecrawler import get_class as get_imagecrawler_class from nichtparasoup.imagecrawler.reddit import Reddit +from . 
import _FileFetcher + _reddit_right_config = dict(subreddit='aww') @@ -64,7 +66,7 @@ def test__build_uri_at_front(self) -> None: # arrange crawler = Reddit(subreddit='foo') # act - uri = crawler._get_uri() + uri = crawler._get_uri(None) # assert self.assertEqual(uri, 'https://www.reddit.com/r/foo.json?after=') @@ -72,25 +74,23 @@ def test__build_uri_at_front__escape(self) -> None: # arrange crawler = Reddit(subreddit='foo/bar bazz') # act - uri = crawler._get_uri() + uri = crawler._get_uri(None) # assert self.assertEqual(uri, 'https://www.reddit.com/r/foo%2Fbar+bazz.json?after=') def test__build_uri_at_after(self) -> None: # arrange crawler = Reddit(subreddit='test') - crawler._after = 'foobar' # act - uri = crawler._get_uri() + uri = crawler._get_uri('foobar') # assert self.assertEqual(uri, 'https://www.reddit.com/r/test.json?after=foobar') def test__build_uri_at_after__escape(self) -> None: # arrange crawler = Reddit(subreddit='test') - crawler._after = 'foo/bar bazz' # act - uri = crawler._get_uri() + uri = crawler._get_uri('foo/bar bazz') # assert self.assertEqual(uri, 'https://www.reddit.com/r/test.json?after=foo%2Fbar+bazz') @@ -107,20 +107,20 @@ def test_reset_done(self) -> None: self.assertIsNone(crawler._after) +_FILE_FETCHER = _FileFetcher({ # relative to "testdata_reddit" + '/r/aww.json?after=': 'aww.json', +}, base_dir=path_join(dirname(__file__), 'testdata_reddit')) + + class RedditCrawlTest(unittest.TestCase): def setUp(self) -> None: self.crawler = Reddit(subreddit='aww') - self.crawler.fetch_remote_data = self.fetch_aww_data # type: ignore + self.crawler._remote_fetcher = _FILE_FETCHER def tearDown(self) -> None: del self.crawler - @staticmethod - def fetch_aww_data(uri: str, *_: Any, **__: Any) -> Tuple[str, str]: - from os.path import join, dirname - return open(join(dirname(__file__), 'testdata_reddit', 'aww.json')).read(), uri - def test_crawl(self) -> None: # arrange expected_after = 't3_dqx42l' @@ -159,7 +159,7 @@ def test_crawl(self) -> 
None: for expected_image in expected_images: for image in images: if image == expected_image: - # sources are invalid for quality, need to be checked manually + # sources are irrelevant for equality, need to be checked manually self.assertEqual(image.source, expected_image.source)