Skip to content

Commit

Permalink
2.0 dev imagecrawler revisit (#164)
Browse files Browse the repository at this point in the history
* image-source became mandatory
* encouraged internal use of `_config` over `get_config()` for ImageCrawlers
* `RemoteFetcher` moved out of `BaseImageCrawler`
* `ImageRecognizer` moved out of `BaseImageCrawler`
* image-uri and -source revisited, internal renames
* revisited BaseImageCrawler
* imagecrawler repr revisited
* used startswith instead of str[a:o]
  • Loading branch information
jkowalleck authored Nov 25, 2019
1 parent d501a09 commit ba7c9c8
Show file tree
Hide file tree
Showing 17 changed files with 366 additions and 134 deletions.
6 changes: 3 additions & 3 deletions nichtparasoup/_internals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@

_logger = logging.getLogger('nichtparasoup')

_LOG_TYPE = Literal['debug', 'info', 'warning', 'error', 'critical', 'log', 'exception']
_LOG_LEVEL = Literal['debug', 'info', 'warning', 'error', 'critical', 'log', 'exception']


def _log(type: _LOG_TYPE, message: str, *args: Any, **kwargs: Any) -> None:
def _log(level: _LOG_LEVEL, message: str, *args: Any, **kwargs: Any) -> None:
if not logging.root.handlers and _logger.level == logging.NOTSET:
_logger.setLevel(logging.INFO)
_logger.addHandler(logging.StreamHandler())
getattr(_logger, type)(message.rstrip(), *args, **kwargs)
getattr(_logger, level)(message.rstrip(), *args, **kwargs)


def _message(message: str, color: Optional[str] = None, file: Optional[TextIO] = None) -> None:
Expand Down
48 changes: 43 additions & 5 deletions nichtparasoup/core/image.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,55 @@
__all__ = ["Image", "ImageCollection", "ImageSource", "ImageUri"]
__all__ = ["Image", "ImageCollection"]

from typing import Any, Optional, Set
from typing import Any, Set
from uuid import uuid4

ImageUri = str

ImageSource = str
SourceUri = str


class Image(object):
"""Describe an image
def __init__(self, uri: ImageUri,
is_generic: bool = False, source: Optional[ImageSource] = None,
`uri`
The absolute URI of the image. This basically identifies the Image and makes it unique.
This absolute URI must include: ``scheme``, ``host``.
``schema`` must be either 'http' or 'https' - the last one is preferred.
Optional are: ``port``, ``path``, ``query``, ``fragment``.
`source`
The URI where did the image is originally found?
This URI can point to a ImageBoardThread, or a comment section in a Forum, or a news article...
In the idea of fair use, it is encouraged to point to the source as good as possible.
This absolute URI must include: ``scheme``, ``host``.
``schema`` must be either 'http' or 'https' - the last one is preferred.
Optional are: ``port``, ``path``, ``query``, ``fragment``.
Good examples are:
* https://www.reddit.com/r/Awww/comments/e1er0c/say_hi_to_loki_hes_just_contemplating/
* https://giphy.com/gifs/10kABVanhwykJW
`is_generic`
If a generic image crawler is used, its common that each image URI looks exactly the same.
To make this known, use this flag.
`more`
A dictionary of additional information an image crawler might want to deliver.
This dictionary's data types are intended to the basic ones: string, int, float, list, set, dict, bool, None
Good examples are:
* image-dimensions
* author, copyright information
* valid-until
"""

def __init__(self, uri: ImageUri, source: SourceUri,
is_generic: bool = False,
**more: Any) -> None: # pragma: no cover
self.uri = uri
self.source = source
Expand Down
149 changes: 96 additions & 53 deletions nichtparasoup/core/imagecrawler.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
__all__ = ["ImageCrawlerConfig", "BaseImageCrawler", "ImageCrawlerInfo"]
__all__ = ["ImageCrawlerConfig", "BaseImageCrawler", "ImageCrawlerInfo", "RemoteFetcher", "ImageRecognizer"]

from abc import ABC, abstractmethod
from http.client import HTTPResponse
from re import IGNORECASE as RE_IGNORECASE, compile as re_compile
from threading import Lock
from typing import Any, Dict, Optional, Pattern, Tuple
from urllib.parse import urlparse
from urllib.request import Request, urlopen

from nichtparasoup._internals import _log
Expand All @@ -28,11 +29,13 @@ class ImageCrawlerConfig(Dict[_ImageCrawlerConfigKey, Any]):
class BaseImageCrawler(ABC):

def __init__(self, **config: Any) -> None: # pragma: no cover
self._config = self.check_config(config)
self._config = self.check_config(config) # intended to be immutable from now on
self._reset_before_next_crawl = True
self._crawl_lock = Lock()
_log('debug', 'crawler initialized {}({:x}) with: {!r}'.format(
type(self).__name__, id(self), self.get_config()))
_log('debug', 'crawler initialized: {!r}'.format(self))

def __repr__(self) -> str:
return '<{0.__module__}.{0.__name__} {1!r}>'.format(type(self), self.get_config())

def __eq__(self, other: Any) -> bool:
if type(self) is type(other):
Expand All @@ -41,90 +44,130 @@ def __eq__(self, other: Any) -> bool:
return False

def get_config(self) -> ImageCrawlerConfig:
return ImageCrawlerConfig(self._config) # is just a shallow copy
"""
Get all *public* information from the config
For internal access to the config using `self._config` is encouraged
"""
return ImageCrawlerConfig({k: v for (k, v) in self._config.items() if not k.startswith('_')})

def reset(self) -> None:
self._reset_before_next_crawl = True
_log('debug', 'crawler reset planned {}({:x})'.format(type(self).__name__, id(self)))
_log('debug', 'crawler reset planned for {!r}'.format(self))

def crawl(self) -> ImageCollection: # pragma: no cover
debug_map = dict(type=type(self).__name__, id=id(self))
with self._crawl_lock:
try:
if self._reset_before_next_crawl:
_log('debug', 'crawler resetting {type}({id:x})'.format_map(debug_map))
_log('debug', 'crawler resetting {!r}'.format(self))
self._reset()
self._reset_before_next_crawl = False
_log('debug', 'crawling started {type}({id:x})'.format_map(debug_map))
_log('debug', 'crawling started {!r}'.format(self))
crawled = self._crawl()
_log('debug', 'crawling finished {type}({id:x})'.format_map(debug_map))
_log('debug', 'crawling finished {!r}'.format(self))
return crawled
except Exception:
_log('exception', 'caught an error during crawling {type}({id:x})'.format_map(debug_map))
_log('exception', 'caught an error during crawling {!r}'.format(self))
return ImageCollection()

_RE_IMAGE_PATH = re_compile(r'.*\.(?:jpeg|jpg|png|gif)(?:[?#].*)?$', flags=RE_IGNORECASE) # type: Pattern[str]

@classmethod
def path_is_image(cls, uri: str) -> bool:
return cls._RE_IMAGE_PATH.match(uri) is not None

_HEADERS_DEFAULT = {
'User-Agent': 'NichtParasoup',
}

@classmethod
def fetch_remote_data(cls, uri: str,
timeout: float = 10.0,
headers: Optional[Dict[str, str]] = None) -> Tuple[str, str]:
_log('debug', 'fetch remote {!r} in {!r} with {!r}'.format(uri, timeout, headers))
request = Request(uri, headers={**cls._HEADERS_DEFAULT, **(headers or dict())})
response = urlopen(request, timeout=timeout) # type: HTTPResponse
actual_uri = response.geturl() # after following redirects ...
charset = str(response.info().get_param('charset', 'UTF-8'))
return response.read().decode(charset), actual_uri

@staticmethod
@abstractmethod
def info() -> ImageCrawlerInfo: # pragma: no cover
return ImageCrawlerInfo(
desc="Some textual description about what this ImageCrawler does.",
config=dict(
# leave the dict empty, if there is nothing to configure
param1="meaning of param1",
paramN="meaning of paramN",
),
version='0.0.dev1',
)
def info(cls) -> ImageCrawlerInfo: # pragma: no cover
"""
Get info of the crawler
example implementation:
return ImageCrawlerInfo(
desc="Some textual description about what this ImageCrawler does.",
config=dict(
# leave the dict empty, if there is nothing to configure
param1="meaning of param1",
paramN="meaning of paramN",
),
version='0.0.dev1',
)
"""
raise NotImplementedError()

@staticmethod
@classmethod
@abstractmethod
def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig: # pragma: no cover
def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig: # pragma: no cover
"""
this function is intended to check if a config is valid and to strip unused config.
This function is intended to check if a config is valid and to strip unused config.
when implementing:
check if any config is viable. if not raise ValueError or TypeError or KeyError or whatever Error
return the viable config for this crawler instance
When implementing:
Check if any config is viable. if not raise ValueError or TypeError or KeyError
or whatever Error.
Return the viable config for this crawler instance.
example:
Example implementation:
height = config["height"] # will raise KeyError automatically
if type(height) is not int:
raise TypeError("height {} is not int".format(height))
if height <= 0:
raise ValueError("height {} <= 0".format(width))
"""
return ImageCrawlerConfig(config)
raise NotImplementedError()

@abstractmethod
def _reset(self) -> None: # pragma: no cover
"""
this function is intended to reset the crawler to restart at front
This function is intended to reset the crawler to restart at front
"""
raise NotImplementedError()

@abstractmethod
def _crawl(self) -> ImageCollection: # pragma: no cover
"""
this function is intended to find and fetch ImageURIs
This function is intended to find and fetch ImageURIs
"""
return ImageCollection()
raise NotImplementedError()


class RemoteFetcher(object):
    """Fetch remote resources over HTTP(S) with a configurable timeout and headers."""

    # default headers sent with every request; per-instance headers are merged on top
    _HEADERS_DEFAULT = {
        'User-Agent': 'NichtParasoup',
    }

    def __init__(self, timeout: float = 10.0, headers: Optional[Dict[str, str]] = None) -> None:  # pragma: no cover
        self._timeout = timeout
        # copy the class-level default so instance updates never mutate it
        self._headers = self._HEADERS_DEFAULT.copy()
        if headers:
            self._headers.update(headers)

    @staticmethod
    def _valid_uri(uri: str) -> bool:
        """Return True if `uri` is an absolute http(s) URI."""
        (scheme, _, _, _, _, _) = urlparse(uri)
        return scheme in {'http', 'https'}

    def get_stream(self, uri: str) -> Tuple[HTTPResponse, str]:
        """Open `uri` and return the open response stream and the actual URI.

        The actual URI may differ from the requested one after redirects.

        :raises ValueError: if `uri` is not an absolute http(s) URI.
        :raises BaseException: whatever `urlopen` raises on network/HTTP errors,
            after logging it.
        """
        if not self._valid_uri(uri):
            raise ValueError('not remote: ' + uri)
        _log('debug', 'fetch remote {!r} in {}s with {!r}'.format(
            uri, self._timeout, self._headers))
        request = Request(uri, headers=self._headers)
        try:
            response = urlopen(request, timeout=self._timeout)  # type: HTTPResponse
        except BaseException:
            _log('debug', 'caught error on fetch remote {!r}'.format(uri), exc_info=True)
            raise  # bare raise preserves the original traceback without an extra frame
        actual_uri = response.geturl()  # after following redirects ...
        return response, actual_uri

    def get_bytes(self, uri: str) -> Tuple[bytes, str]:
        """Fetch `uri` and return its raw body bytes and the actual URI."""
        response, actual_uri = self.get_stream(uri)
        return response.read(), actual_uri

    def get_string(self, uri: str, charset_fallback: str = 'UTF-8') -> Tuple[str, str]:
        """Fetch `uri` and return its body decoded as text and the actual URI.

        The charset is taken from the response headers, falling back to
        `charset_fallback` when the server did not declare one.
        """
        response, actual_uri = self.get_stream(uri)
        charset = str(response.info().get_param('charset', charset_fallback))
        return response.read().decode(charset), actual_uri


class ImageRecognizer(object):
    """Heuristics to decide whether a URI points at an image resource."""

    # path ending in a known image extension, optionally followed by a
    # query string or fragment
    _PATH_RE = re_compile(r'.+\.(?:jpeg|jpg|png|gif|svg)(?:[?#].*)?$', flags=RE_IGNORECASE)  # type: Pattern[str]

    def path_is_image(self, uri: str) -> bool:
        """Tell whether the URI's path looks like an image file."""
        return bool(self._PATH_RE.match(uri))
4 changes: 1 addition & 3 deletions nichtparasoup/core/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@ def get_image(self) -> Optional[Dict[str, Any]]:
def _log_refill_crawler(crawler: Crawler, refilled: int) -> None:
# must be compatible to nichtparasoup.core._OnFill
if refilled > 0:
_log('info', "refilled via {}({:x}) by {}".format(
type(crawler.imagecrawler).__name__, id(crawler.imagecrawler),
refilled))
_log('info', "refilled by {} via {!r}".format(refilled, crawler.imagecrawler))

def refill(self) -> Dict[str, bool]:
with self._locks.refill:
Expand Down
10 changes: 5 additions & 5 deletions nichtparasoup/imagecrawler/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

class Dummy(BaseImageCrawler):

@staticmethod
def info() -> ImageCrawlerInfo:
@classmethod
def info(cls) -> ImageCrawlerInfo:
from nichtparasoup import __version__
return ImageCrawlerInfo(
desc='"Finds" the same image ... again ... and again.',
Expand All @@ -19,8 +19,8 @@ def info() -> ImageCrawlerInfo:
version=__version__,
)

@staticmethod
def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig:
@classmethod
def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
image_uri = config["image_uri"]
if type(image_uri) is not str:
raise TypeError("image_uri {!r} is not str".format(image_uri))
Expand All @@ -37,7 +37,7 @@ def _crawl(self) -> ImageCollection:
images = ImageCollection()
config = self.get_config()
images.add(Image(
config["image_uri"],
config["image_uri"], config["image_uri"],
is_generic=True,
this_is_a_dummy=True,
))
Expand Down
16 changes: 8 additions & 8 deletions nichtparasoup/imagecrawler/picsum.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
from typing import Any, Dict

from nichtparasoup.core.image import Image, ImageCollection, ImageUri
from nichtparasoup.core.image import Image, ImageCollection
from nichtparasoup.core.imagecrawler import BaseImageCrawler, ImageCrawlerConfig, ImageCrawlerInfo

__all__ = ["Picsum"]


class Picsum(BaseImageCrawler):

_bunch = 10

@staticmethod
def info() -> ImageCrawlerInfo:
@classmethod
def info(cls) -> ImageCrawlerInfo:
from nichtparasoup import __version__
return ImageCrawlerInfo(
desc='Find images from https://picsum.photos',
Expand All @@ -22,8 +21,8 @@ def info() -> ImageCrawlerInfo:
version=__version__,
)

@staticmethod
def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig:
@classmethod
def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
width = config["width"]
height = config["height"]
if type(width) is not int:
Expand All @@ -40,7 +39,7 @@ def check_config(config: Dict[Any, Any]) -> ImageCrawlerConfig:
)

@staticmethod
def _get_image_uri(width: int, height: int) -> ImageUri:
def _get_image_uri(width: int, height: int) -> str:
return "https://picsum.photos/{}/{}".format(width, height)

def _reset(self) -> None: # pragma: no cover
Expand All @@ -50,8 +49,9 @@ def _crawl(self) -> ImageCollection:
images = ImageCollection()
config = self.get_config()
for _ in range(0, self._bunch):
uri = self._get_image_uri(**config)
images.add(Image(
self._get_image_uri(**config),
uri, uri,
is_generic=True,
))
return images
Loading

0 comments on commit ba7c9c8

Please sign in to comment.