Skip to content

Commit

Permalink
Added board_game_scraper.pipelines.LimitImagesPipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkusShepherd committed Dec 1, 2024
1 parent dafb2d2 commit a25ef6d
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/board_game_scraper/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class GameItem:
url: str | None = None
official_url: list[str] | None = None
image_url: list[str] | None = None
image_url_download: list[str] | None = None
image_file: list[dict[str, str]] | None = None
image_blurhash: list[dict[str, str]] | None = None
video_url: list[str] | None = None
Expand Down Expand Up @@ -112,6 +113,7 @@ class RankingItem:
bayes_rating: float | None = None

image_url: list[str] | None = None
image_url_download: list[str] | None = None
image_file: list[dict[str, str]] | None = None
image_blurhash: list[dict[str, str]] | None = None

Expand Down Expand Up @@ -140,6 +142,7 @@ class UserItem:

external_link: list[str] | None = None
image_url: list[str] | None = None
image_url_download: list[str] | None = None
image_file: list[dict[str, str]] | None = None
image_blurhash: list[dict[str, str]] | None = None

Expand Down
85 changes: 85 additions & 0 deletions src/board_game_scraper/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Scrapy item pipelines"""

from __future__ import annotations

from itertools import islice
from typing import TYPE_CHECKING

from itemadapter import ItemAdapter
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import arg_to_iter

if TYPE_CHECKING:
from typing import Self, TypeVar

from scrapy import Spider
from scrapy.crawler import Crawler

Typed = TypeVar("Typed")


class LimitImagesPipeline:
    """
    Item pipeline that fills the download field with a capped list of image URLs.

    For every item, the URLs found in ``source_field`` are copied into
    ``target_field``, keeping at most ``limit`` of them. A ``limit`` of ``None``
    or any negative number keeps all URLs; a ``limit`` of zero keeps none.
    """

    source_field: str
    target_field: str
    limit: int | None = None

    def __init__(
        self,
        source_field: str,
        target_field: str,
        limit: int | None = None,
    ):
        """Remember the two field names and the cap applied by ``process_item``."""
        self.source_field = source_field
        self.target_field = target_field
        self.limit = limit

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """
        Build the pipeline from the crawler's settings.

        Reads ``LIMIT_IMAGES_URLS_FIELD`` (source), ``IMAGES_URLS_FIELD``
        (target) and ``LIMIT_IMAGES_TO_DOWNLOAD`` (cap, -1 when unset).
        Raises ``NotConfigured`` if either field name is missing or empty.
        """
        settings = crawler.settings
        source = settings.get("LIMIT_IMAGES_URLS_FIELD")
        target = settings.get("IMAGES_URLS_FIELD")

        # Both field names are required; without them there is nothing to copy.
        if not (source and target):
            raise NotConfigured

        return cls(
            source_field=source,
            target_field=target,
            limit=settings.getint("LIMIT_IMAGES_TO_DOWNLOAD", -1),
        )

    def process_item(
        self,
        item: Typed,
        spider: Spider,  # noqa: ARG002
    ) -> Typed:
        """
        Write up to ``limit`` URLs from the source field into the target field.

        The item is modified in place and returned. Items that do not declare
        the target field are returned untouched.
        """
        wrapped = ItemAdapter(item)

        # Assigning a field the item does not declare would raise, so skip it.
        if self.target_field not in wrapped.field_names():
            return item

        unlimited = self.limit is None or self.limit < 0

        # A cap of zero means "download nothing": no need to read the source.
        if not unlimited and not self.limit:
            wrapped[self.target_field] = []
            return item

        # islice with a stop of None passes every element through, which
        # covers the unlimited case; otherwise it keeps the first `limit`.
        stop = None if unlimited else self.limit
        candidates = arg_to_iter(wrapped.get(self.source_field))
        wrapped[self.target_field] = list(islice(candidates, stop))
        return item
5 changes: 5 additions & 0 deletions src/board_game_scraper/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# The integer values set the order in which pipelines run (lower first, per the
# Scrapy docs linked above). LimitImagesPipeline is placed ahead of
# ImagesPipeline so that the field named by IMAGES_URLS_FIELD is filled with the
# limited URL list before the images pipeline runs.
ITEM_PIPELINES = {
"board_game_scraper.pipelines.LimitImagesPipeline": 500,
"scrapy.pipelines.images.ImagesPipeline": 600,
"scrapy_extensions.BlurHashPipeline": 700,
}
Expand Down Expand Up @@ -176,6 +177,10 @@
IMAGES_EXPIRES = 360
# IMAGES_THUMBS = {"thumb": (1024, 1024)}

# Limit images to download
# LIMIT_IMAGES_TO_DOWNLOAD is the maximum number of URLs that
# LimitImagesPipeline copies, per item, from the field named by
# LIMIT_IMAGES_URLS_FIELD into the field named by IMAGES_URLS_FIELD:
# 0 copies none, a positive N copies the first N, and a negative value copies
# all of them (the pipeline falls back to -1 when the setting is absent).
LIMIT_IMAGES_TO_DOWNLOAD = 0
LIMIT_IMAGES_URLS_FIELD = "image_url"

# BlurHash
BLURHASH_FIELD = "image_blurhash"
BLURHASH_X_COMPONENTS = 4
Expand Down
1 change: 1 addition & 0 deletions src/board_game_scraper/spiders/bgg.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class BggSpider(SitemapSpider):
custom_settings = { # noqa: RUF012
"DOWNLOAD_DELAY": 2.0,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
"LIMIT_IMAGES_TO_DOWNLOAD": 1,
}

def __init__(
Expand Down

0 comments on commit a25ef6d

Please sign in to comment.