From a25ef6d716bb74b6f4a097451c8a061da0675292 Mon Sep 17 00:00:00 2001
From: Markus Schepke <markus.schepke@wolt.com>
Date: Sun, 1 Dec 2024 23:52:07 +0200
Subject: [PATCH] Added board_game_scraper.pipelines.LimitImagesPipeline

---
 src/board_game_scraper/items.py       |  3 +
 src/board_game_scraper/pipelines.py   | 85 +++++++++++++++++++++++++++
 src/board_game_scraper/settings.py    |  5 ++
 src/board_game_scraper/spiders/bgg.py |  1 +
 4 files changed, 94 insertions(+)
 create mode 100644 src/board_game_scraper/pipelines.py

diff --git a/src/board_game_scraper/items.py b/src/board_game_scraper/items.py
index 33f9912..7407387 100644
--- a/src/board_game_scraper/items.py
+++ b/src/board_game_scraper/items.py
@@ -29,6 +29,7 @@ class GameItem:
     url: str | None = None
     official_url: list[str] | None = None
     image_url: list[str] | None = None
+    image_url_download: list[str] | None = None
     image_file: list[dict[str, str]] | None = None
     image_blurhash: list[dict[str, str]] | None = None
     video_url: list[str] | None = None
@@ -112,6 +113,7 @@ class RankingItem:
     bayes_rating: float | None = None
 
     image_url: list[str] | None = None
+    image_url_download: list[str] | None = None
     image_file: list[dict[str, str]] | None = None
     image_blurhash: list[dict[str, str]] | None = None
 
@@ -140,6 +142,7 @@ class UserItem:
     external_link: list[str] | None = None
 
     image_url: list[str] | None = None
+    image_url_download: list[str] | None = None
     image_file: list[dict[str, str]] | None = None
     image_blurhash: list[dict[str, str]] | None = None
 
diff --git a/src/board_game_scraper/pipelines.py b/src/board_game_scraper/pipelines.py
new file mode 100644
index 0000000..19110d5
--- /dev/null
+++ b/src/board_game_scraper/pipelines.py
@@ -0,0 +1,85 @@
+"""Scrapy item pipelines."""
+
+from __future__ import annotations
+
+from itertools import islice
+from typing import TYPE_CHECKING
+
+from itemadapter import ItemAdapter
+from scrapy.exceptions import NotConfigured
+from scrapy.utils.misc import arg_to_iter
+
+if TYPE_CHECKING:
+    from typing import Self, TypeVar
+
+    from scrapy import Spider
+    from scrapy.crawler import Crawler
+
+    Typed = TypeVar("Typed")
+
+
+class LimitImagesPipeline:
+    """Copy up to `limit` image URLs from the source field to the target field."""
+
+    source_field: str
+    target_field: str
+    limit: int | None = None
+
+    @classmethod
+    def from_crawler(cls, crawler: Crawler) -> Self:
+        """Build from crawler settings; raise NotConfigured if a field is unset."""
+
+        source_field = crawler.settings.get("LIMIT_IMAGES_URLS_FIELD")
+        target_field = crawler.settings.get("IMAGES_URLS_FIELD")
+
+        if not source_field or not target_field:
+            raise NotConfigured
+
+        limit = crawler.settings.getint("LIMIT_IMAGES_TO_DOWNLOAD", -1)
+
+        return cls(
+            source_field=source_field,
+            target_field=target_field,
+            limit=limit,
+        )
+
+    def __init__(
+        self,
+        source_field: str,
+        target_field: str,
+        limit: int | None = None,
+    ):
+        self.source_field = source_field
+        self.target_field = target_field
+        self.limit = limit
+
+    def process_item(
+        self,
+        item: Typed,
+        spider: Spider,  # noqa: ARG002
+    ) -> Typed:
+        """
+        Copy up to `limit` image URLs from source to target (all if None/negative).
+        """
+
+        adapter = ItemAdapter(item)
+
+        # item has no target field; adding it would result in an error, so skip
+        if self.target_field not in adapter.field_names():
+            return item
+
+        if self.limit is None or self.limit < 0:  # no limit: copy every URL
+            adapter[self.target_field] = list(
+                arg_to_iter(adapter.get(self.source_field)),
+            )
+            return item
+
+        if not self.limit:  # limit is zero: target becomes an empty list
+            adapter[self.target_field] = []
+            return item
+
+        # positive limit: keep only the first `limit` URLs from the source field
+        adapter[self.target_field] = list(
+            islice(arg_to_iter(adapter.get(self.source_field)), self.limit),
+        )
+        return item
diff --git a/src/board_game_scraper/settings.py b/src/board_game_scraper/settings.py
index 10f4a6b..3cc561d 100644
--- a/src/board_game_scraper/settings.py
+++ b/src/board_game_scraper/settings.py
@@ -121,6 +121,7 @@
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
+    "board_game_scraper.pipelines.LimitImagesPipeline": 500,
     "scrapy.pipelines.images.ImagesPipeline": 600,
     "scrapy_extensions.BlurHashPipeline": 700,
 }
@@ -176,6 +177,10 @@
 IMAGES_EXPIRES = 360
 # IMAGES_THUMBS = {"thumb": (1024, 1024)}
 
+# Limit images to download
+LIMIT_IMAGES_TO_DOWNLOAD = 0
+LIMIT_IMAGES_URLS_FIELD = "image_url"
+
 # BlurHash
 BLURHASH_FIELD = "image_blurhash"
 BLURHASH_X_COMPONENTS = 4
diff --git a/src/board_game_scraper/spiders/bgg.py b/src/board_game_scraper/spiders/bgg.py
index f5d7fc0..58ba85d 100644
--- a/src/board_game_scraper/spiders/bgg.py
+++ b/src/board_game_scraper/spiders/bgg.py
@@ -77,6 +77,7 @@ class BggSpider(SitemapSpider):
     custom_settings = {  # noqa: RUF012
         "DOWNLOAD_DELAY": 2.0,
         "AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
+        "LIMIT_IMAGES_TO_DOWNLOAD": 1,
     }
 
     def __init__(