Skip to content

Commit

Permalink
2.0 dev imagecrawler revisit (#164)
Browse files Browse the repository at this point in the history
* image-source became mandatory
* encouraged internal use of `_config` over `get_config()` for ImageCrawlers
* `RemoteFetcher` moved out of `BaseImageCrawler`
* `ImageRecognizer` moved out of `BaseImageCrawler`
* image-uri and -source revisited, internal renames
* revisited BaseImageCrawler
* imagecrawler repr revisited
* used startswith instead of str[a:o]
  • Loading branch information
jkowalleck authored Nov 25, 2019
1 parent d501a09 commit ba7c9c8
Show file tree
Hide file tree
Showing 17 changed files with 366 additions and 134 deletions.
6 changes: 3 additions & 3 deletions nichtparasoup/_internals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@

_logger = logging.getLogger('nichtparasoup')

_LOG_TYPE = Literal['debug', 'info', 'warning', 'error', 'critical', 'log', 'exception']
_LOG_LEVEL = Literal['debug', 'info', 'warning', 'error', 'critical', 'log', 'exception']


def _log(type: _LOG_TYPE, message: str, *args: Any, **kwargs: Any) -> None:
def _log(level: _LOG_LEVEL, message: str, *args: Any, **kwargs: Any) -> None:
if not logging.root.handlers and _logger.level == logging.NOTSET:
_logger.setLevel(logging.INFO)
_logger.addHandler(logging.StreamHandler())
getattr(_logger, type)(message.rstrip(), *args, **kwargs)
getattr(_logger, level)(message.rstrip(), *args, **kwargs)


def _message(message: str, color: Optional[str] = None, file: Optional[TextIO] = None) -> None:
Expand Down
48 changes: 43 additions & 5 deletions nichtparasoup/core/image.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,55 @@
__all__ = ["Image", "ImageCollection", "ImageSource", "ImageUri"]
__all__ = ["Image", "ImageCollection"]

from typing import Any, Optional, Set
from typing import Any, Set
from uuid import uuid4

ImageUri = str

ImageSource = str
SourceUri = str


class Image(object):
"""Describe an image
def __init__(self, uri: ImageUri,
is_generic: bool = False, source: Optional[ImageSource] = None,
`uri`
The absolute URI of the image. This basically identifies the Image and makes it unique.
This absolute URI must include: ``scheme``, ``host``.
``schema`` must be either 'http' or 'https' - the last one is preferred.
Optional are: ``port``, ``path``, ``query``, ``fragment``.
`source`
The URI where did the image is originally found?
This URI can point to a ImageBoardThread, or a comment section in a Forum, or a news article...
In the idea of fair use, it is encouraged to point to the source as good as possible.
This absolute URI must include: ``scheme``, ``host``.
``schema`` must be either 'http' or 'https' - the last one is preferred.
Optional are: ``port``, ``path``, ``query``, ``fragment``.
Good examples are:
* https://www.reddit.com/r/Awww/comments/e1er0c/say_hi_to_loki_hes_just_contemplating/
* https://giphy.com/gifs/10kABVanhwykJW
`is_generic`
If a generic image crawler is used, its common that each image URI looks exactly the same.
To make this known, use this flag.
`more`
A dictionary of additional information an image crawler might want to deliver.
This dictionary's data types are intended to the basic ones: string, int, float, list, set, dict, bool, None
Good examples are:
* image-dimensions
* author, copyright information
* valid-until
"""

def __init__(self, uri: ImageUri, source: SourceUri,
is_generic: bool = False,
**more: Any) -> None: # pragma: no cover
self.uri = uri
self.source = source
Expand Down
149 changes: 96 additions & 53 deletions nichtparasoup/core/imagecrawler.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
__all__ = ["ImageCrawlerConfig", "BaseImageCrawler", "ImageCrawlerInfo"]
__all__ = ["ImageCrawlerConfig", "BaseImageCrawler", "ImageCrawlerInfo", "RemoteFetcher", "ImageRecognizer"]

from abc import ABC, abstractmethod
from http.client import HTTPResponse
from re import IGNORECASE as RE_IGNORECASE, compile as re_compile
from threading import Lock
from typing import Any, Dict, Optional, Pattern, Tuple
from urllib.parse import urlparse
from urllib.request import Request, urlopen

from nichtparasoup._internals import _log
Expand All @@ -28,11 +29,13 @@ class ImageCrawlerConfig(Dict[_ImageCrawlerConfigKey, Any]):
class BaseImageCrawler(ABC):

def __init__(self, **config: Any) -> None: # pragma: no cover
self._config = self.check_config(config)
self._config = self.check_config(config) # intended to be immutable from now on
self._reset_before_next_crawl = True
self._crawl_lock = Lock()
_log('debug', 'crawler initialized {}({:x}) with: {!r}'.format(
type(self).__name__, id(self), self.get_config()))
_log('debug', 'crawler initialized: {!r}'.format(self))

def __repr__(self) -> str:
return '<{0.__module__}.{0.__name__} {1!r}>'.format(type(self), self.get_config())

def __eq__(self, other: Any) -> bool:
if type(self) is type(other):
Expand All @@ -41,90 +44,130 @@ def __eq__(self, other: Any) -> bool:
return False

def get_config(self) -> ImageCrawlerConfig:
return ImageCrawlerConfig(self._config) # is just a shallow copy
"""
Get all *public* information from the config
For internal access to the config using `self._config` is encouraged
"""
return ImageCrawlerConfig({k: v for (k, v) in self._config.items() if not k.startswith('_')})

def reset(self) -> None:
self._reset_before_next_crawl = True
_log('debug', 'crawler reset planned {}({:x})'.format(type(self).__name__, id(self)))
_log('debug', 'crawler reset planned for {!r}'.format(self))

def crawl(self) -> ImageCollection: # pragma: no cover
debug_map = dict(type=type(self).__name__, id=id(self))
with self._crawl_lock:
try:
if self._reset_before_next_crawl:
_log('debug', 'crawler resetting {type}({id:x})'.format_map(debug_map))
_log('debug', 'crawler resetting {!r}'.format(self))
self._reset()
self._reset_before_next_crawl = False
_log('debug', 'crawling started {type}({id:x})'.format_map(debug_map))
_log('debug', 'crawling started {!r}'.format(self))
crawled = self._crawl()
_log('debug', 'crawling finished {type}({id:x})'.format_map(debug_map))
_log('debug', 'crawling finished {!r}'.format(self))
return crawled
except Exception:
_log('exception', 'caught an error during crawling {type}({id:x})'.format_map(debug_map))
_log('exception', 'caught an error during crawling {!r}'.format(self))
return ImageCollection()

_RE_IMAGE_PATH = re_compile(r'.*\.(?:jpeg|jpg|png|gif)(?:[?#].*)?$', flags=RE_IGNORECASE) # type: Pattern[str]

@classmethod
def path_is_image(cls, uri: str) -> bool:
return cls._RE_IMAGE_PATH.match(uri) is not None

_HEADERS_DEFAULT = {
'User-Agent': 'NichtParasoup',
}

@classmethod
def fetch_remote_data(cls, uri: str,
timeout: float = 10.0,
headers: Optional[Dict[str, str]] = None) -> Tuple[str, str]:
_log('debug', 'fetch remote {!r} in {!r} with {!r}'.format(uri, timeout, headers))
request = Request(uri, headers={**cls._HEADERS_DEFAULT, **(headers or dict())})
response = urlopen(request, timeout=timeout) # type: HTTPResponse
actual_uri = response.geturl() # after following redirects ...
charset = str(response.info().get_param('charset', 'UTF-8'))
return response.read().decode(charset), actual_uri

@staticmethod
@abstractmethod
def info() -> ImageCrawlerInfo: # pragma: no cover
return ImageCrawlerInfo(
desc="Some textual description about what this ImageCrawler does.",
config=dict(
# leave the dict empty, if there is nothing to configure
param1="meaning of param1",
paramN="meaning of paramN",
),
version='0.0.dev1',
)
def info(cls) -> ImageCrawlerInfo: # pragma: no cover
"""
Get info of the crawler
example implementation:
return ImageCrawlerInfo(
desc="Some textual description about what this ImageCrawler does.",
config=dict(
# leave the dict empty, if there is nothing to configure
param1="meaning of param1",
paramN="meaning of paramN",
),
version='0.0.dev1',
)
"""
raise NotImplementedError()

@staticmethod
@classmethod
@abstractmethod
def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig: # pragma: no cover
def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig: # pragma: no cover
"""
this function is intended to check if a config is valid and to strip unused config.
This function is intended to check if a config is valid and to strip unused config.
when implementing:
check if any config is viable. if not raise ValueError or TypeError or KeyError or whatever Error
return the viable config for this crawler instance
When implementing:
Check if any config is viable. if not raise ValueError or TypeError or KeyError
or whatever Error.
Return the viable config for this crawler instance.
example:
Example implementation:
height = config["height"] # will raise KeyError automatically
if type(height) is not int:
raise TypeError("height {} is not int".format(height))
if height <= 0:
raise ValueError("height {} <= 0".format(width))
"""
return ImageCrawlerConfig(config)
raise NotImplementedError()

@abstractmethod
def _reset(self) -> None: # pragma: no cover
"""
this function is intended to reset the crawler to restart at front
This function is intended to reset the crawler to restart at front
"""
raise NotImplementedError()

@abstractmethod
def _crawl(self) -> ImageCollection: # pragma: no cover
"""
this function is intended to find and fetch ImageURIs
This function is intended to find and fetch ImageURIs
"""
return ImageCollection()
raise NotImplementedError()


class RemoteFetcher(object):
    """Fetch remote resources over HTTP(S) with a configurable timeout and headers."""

    # default headers sent with every request; per-instance headers are merged on top
    _HEADERS_DEFAULT = {
        'User-Agent': 'NichtParasoup',
    }

    def __init__(self, timeout: float = 10.0, headers: Optional[Dict[str, str]] = None) -> None:  # pragma: no cover
        self._timeout = timeout
        # copy the class-level default so instance updates never mutate it
        self._headers = self._HEADERS_DEFAULT.copy()
        if headers:
            self._headers.update(headers)

    @staticmethod
    def _valid_uri(uri: str) -> bool:
        """Return True if `uri` is an absolute http(s) URI."""
        (scheme, _, _, _, _, _) = urlparse(uri)
        return scheme in {'http', 'https'}

    def get_stream(self, uri: str) -> Tuple[HTTPResponse, str]:
        """Open `uri` and return the open response stream and the actual URI.

        The actual URI may differ from the requested one after redirects.

        :raises ValueError: if `uri` is not an absolute http(s) URI.
        :raises BaseException: whatever `urlopen` raises on network/HTTP errors,
            after logging it.
        """
        if not self._valid_uri(uri):
            raise ValueError('not remote: ' + uri)
        _log('debug', 'fetch remote {!r} in {}s with {!r}'.format(
            uri, self._timeout, self._headers))
        request = Request(uri, headers=self._headers)
        try:
            response = urlopen(request, timeout=self._timeout)  # type: HTTPResponse
        except BaseException:
            _log('debug', 'caught error on fetch remote {!r}'.format(uri), exc_info=True)
            raise  # bare raise preserves the original traceback without an extra frame
        actual_uri = response.geturl()  # after following redirects ...
        return response, actual_uri

    def get_bytes(self, uri: str) -> Tuple[bytes, str]:
        """Fetch `uri` and return its raw body bytes and the actual URI."""
        response, actual_uri = self.get_stream(uri)
        return response.read(), actual_uri

    def get_string(self, uri: str, charset_fallback: str = 'UTF-8') -> Tuple[str, str]:
        """Fetch `uri` and return its body decoded as text and the actual URI.

        The charset is taken from the response headers, falling back to
        `charset_fallback` when the server did not declare one.
        """
        response, actual_uri = self.get_stream(uri)
        charset = str(response.info().get_param('charset', charset_fallback))
        return response.read().decode(charset), actual_uri


class ImageRecognizer(object):
    """Heuristics to decide whether a URI points at an image resource."""

    # path ending in a known image extension, optionally followed by a
    # query string or fragment
    _PATH_RE = re_compile(r'.+\.(?:jpeg|jpg|png|gif|svg)(?:[?#].*)?$', flags=RE_IGNORECASE)  # type: Pattern[str]

    def path_is_image(self, uri: str) -> bool:
        """Tell whether the URI's path looks like an image file."""
        return bool(self._PATH_RE.match(uri))
4 changes: 1 addition & 3 deletions nichtparasoup/core/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@ def get_image(self) -> Optional[Dict[str, Any]]:
def _log_refill_crawler(crawler: Crawler, refilled: int) -> None:
# must be compatible to nichtparasoup.core._OnFill
if refilled > 0:
_log('info', "refilled via {}({:x}) by {}".format(
type(crawler.imagecrawler).__name__, id(crawler.imagecrawler),
refilled))
_log('info', "refilled by {} via {!r}".format(refilled, crawler.imagecrawler))

def refill(self) -> Dict[str, bool]:
with self._locks.refill:
Expand Down
10 changes: 5 additions & 5 deletions nichtparasoup/imagecrawler/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

class Dummy(BaseImageCrawler):

@staticmethod
def info() -> ImageCrawlerInfo:
@classmethod
def info(cls) -> ImageCrawlerInfo:
from nichtparasoup import __version__
return ImageCrawlerInfo(
desc='"Finds" the same image ... again ... and again.',
Expand All @@ -19,8 +19,8 @@ def info() -> ImageCrawlerInfo:
version=__version__,
)

@staticmethod
def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig:
@classmethod
def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
image_uri = config["image_uri"]
if type(image_uri) is not str:
raise TypeError("image_uri {!r} is not str".format(image_uri))
Expand All @@ -37,7 +37,7 @@ def _crawl(self) -> ImageCollection:
images = ImageCollection()
config = self.get_config()
images.add(Image(
config["image_uri"],
config["image_uri"], config["image_uri"],
is_generic=True,
this_is_a_dummy=True,
))
Expand Down
16 changes: 8 additions & 8 deletions nichtparasoup/imagecrawler/picsum.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
from typing import Any, Dict

from nichtparasoup.core.image import Image, ImageCollection, ImageUri
from nichtparasoup.core.image import Image, ImageCollection
from nichtparasoup.core.imagecrawler import BaseImageCrawler, ImageCrawlerConfig, ImageCrawlerInfo

__all__ = ["Picsum"]


class Picsum(BaseImageCrawler):

_bunch = 10

@staticmethod
def info() -> ImageCrawlerInfo:
@classmethod
def info(cls) -> ImageCrawlerInfo:
from nichtparasoup import __version__
return ImageCrawlerInfo(
desc='Find images from https://picsum.photos',
Expand All @@ -22,8 +21,8 @@ def info() -> ImageCrawlerInfo:
version=__version__,
)

@staticmethod
def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig:
@classmethod
def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
width = config["width"]
height = config["height"]
if type(width) is not int:
Expand All @@ -40,7 +39,7 @@ def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig:
)

@staticmethod
def _get_image_uri(width: int, height: int) -> ImageUri:
def _get_image_uri(width: int, height: int) -> str:
return "https://picsum.photos/{}/{}".format(width, height)

def _reset(self) -> None: # pragma: no cover
Expand All @@ -50,8 +49,9 @@ def _crawl(self) -> ImageCollection:
images = ImageCollection()
config = self.get_config()
for _ in range(0, self._bunch):
uri = self._get_image_uri(**config)
images.add(Image(
self._get_image_uri(**config),
uri, uri,
is_generic=True,
))
return images
Loading

0 comments on commit ba7c9c8

Please sign in to comment.