From a25ef6d716bb74b6f4a097451c8a061da0675292 Mon Sep 17 00:00:00 2001
From: Markus Schepke <markus.schepke@wolt.com>
Date: Sun, 1 Dec 2024 23:52:07 +0200
Subject: [PATCH] Add board_game_scraper.pipelines.LimitImagesPipeline

---
 src/board_game_scraper/items.py       |  3 +
 src/board_game_scraper/pipelines.py   | 85 +++++++++++++++++++++++++++
 src/board_game_scraper/settings.py    |  5 ++
 src/board_game_scraper/spiders/bgg.py |  1 +
 4 files changed, 94 insertions(+)
 create mode 100644 src/board_game_scraper/pipelines.py

diff --git a/src/board_game_scraper/items.py b/src/board_game_scraper/items.py
index 33f9912..7407387 100644
--- a/src/board_game_scraper/items.py
+++ b/src/board_game_scraper/items.py
@@ -29,6 +29,7 @@ class GameItem:
     url: str | None = None
     official_url: list[str] | None = None
     image_url: list[str] | None = None
+    image_url_download: list[str] | None = None
     image_file: list[dict[str, str]] | None = None
     image_blurhash: list[dict[str, str]] | None = None
     video_url: list[str] | None = None
@@ -112,6 +113,7 @@ class RankingItem:
     bayes_rating: float | None = None
 
     image_url: list[str] | None = None
+    image_url_download: list[str] | None = None
     image_file: list[dict[str, str]] | None = None
     image_blurhash: list[dict[str, str]] | None = None
 
@@ -140,6 +142,7 @@ class UserItem:
 
     external_link: list[str] | None = None
     image_url: list[str] | None = None
+    image_url_download: list[str] | None = None
     image_file: list[dict[str, str]] | None = None
     image_blurhash: list[dict[str, str]] | None = None
 
diff --git a/src/board_game_scraper/pipelines.py b/src/board_game_scraper/pipelines.py
new file mode 100644
index 0000000..19110d5
--- /dev/null
+++ b/src/board_game_scraper/pipelines.py
@@ -0,0 +1,85 @@
+"""Scrapy item pipelines."""
+
+from __future__ import annotations
+
+from itertools import islice
+from typing import TYPE_CHECKING
+
+from itemadapter import ItemAdapter
+from scrapy.exceptions import NotConfigured
+from scrapy.utils.misc import arg_to_iter
+
+if TYPE_CHECKING:
+    from typing import Self, TypeVar
+
+    from scrapy import Spider
+    from scrapy.crawler import Crawler
+
+    # process_item is generic: it returns the same item type it receives.
+    Typed = TypeVar("Typed")
+
+
+class LimitImagesPipeline:
+    """Copy a limited number of image URLs to be downloaded from source to target.
+
+    Reads URLs from the item field named by the LIMIT_IMAGES_URLS_FIELD
+    setting and writes at most LIMIT_IMAGES_TO_DOWNLOAD of them into the
+    field named by IMAGES_URLS_FIELD for the images pipeline to download.
+    """
+
+    source_field: str
+    target_field: str
+    limit: int | None = None
+
+    @classmethod
+    def from_crawler(cls, crawler: Crawler) -> Self:
+        """Init from crawler settings; raise NotConfigured if either field is unset."""
+
+        source_field = crawler.settings.get("LIMIT_IMAGES_URLS_FIELD")
+        target_field = crawler.settings.get("IMAGES_URLS_FIELD")
+
+        if not source_field or not target_field:
+            raise NotConfigured
+
+        # Negative (the default) means "no limit": copy all URLs through.
+        limit = crawler.settings.getint("LIMIT_IMAGES_TO_DOWNLOAD", -1)
+
+        return cls(
+            source_field=source_field,
+            target_field=target_field,
+            limit=limit,
+        )
+
+    def __init__(
+        self,
+        source_field: str,
+        target_field: str,
+        limit: int | None = None,
+    ):
+        self.source_field = source_field
+        self.target_field = target_field
+        self.limit = limit
+
+    def process_item(
+        self,
+        item: Typed,
+        spider: Spider,  # noqa: ARG002
+    ) -> Typed:
+        """
+        Copy a limited number of image URLs to be downloaded from source to target.
+        """
+
+        adapter = ItemAdapter(item)
+
+        # Assigning to a field the item doesn't declare would raise; skip.
+        if self.target_field not in adapter.field_names():
+            return item
+
+        urls = arg_to_iter(adapter.get(self.source_field))
+
+        if self.limit is None or self.limit < 0:  # no limit: copy everything
+            adapter[self.target_field] = list(urls)
+        else:  # islice(urls, 0) naturally yields nothing when limit == 0
+            adapter[self.target_field] = list(islice(urls, self.limit))
+
+        return item
diff --git a/src/board_game_scraper/settings.py b/src/board_game_scraper/settings.py
index 10f4a6b..3cc561d 100644
--- a/src/board_game_scraper/settings.py
+++ b/src/board_game_scraper/settings.py
@@ -121,6 +121,7 @@
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
+    "board_game_scraper.pipelines.LimitImagesPipeline": 500,
     "scrapy.pipelines.images.ImagesPipeline": 600,
     "scrapy_extensions.BlurHashPipeline": 700,
 }
@@ -176,6 +177,10 @@
 IMAGES_EXPIRES = 360
 # IMAGES_THUMBS = {"thumb": (1024, 1024)}
 
+# Limit images to download
+LIMIT_IMAGES_TO_DOWNLOAD = 0
+LIMIT_IMAGES_URLS_FIELD = "image_url"
+
 # BlurHash
 BLURHASH_FIELD = "image_blurhash"
 BLURHASH_X_COMPONENTS = 4
diff --git a/src/board_game_scraper/spiders/bgg.py b/src/board_game_scraper/spiders/bgg.py
index f5d7fc0..58ba85d 100644
--- a/src/board_game_scraper/spiders/bgg.py
+++ b/src/board_game_scraper/spiders/bgg.py
@@ -77,6 +77,7 @@ class BggSpider(SitemapSpider):
     custom_settings = {  # noqa: RUF012
         "DOWNLOAD_DELAY": 2.0,
         "AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
+        "LIMIT_IMAGES_TO_DOWNLOAD": 1,
     }
 
     def __init__(