diff --git a/src/board_game_scraper/middlewares.py b/src/board_game_scraper/middlewares.py
deleted file mode 100644
index e9308fb..0000000
--- a/src/board_game_scraper/middlewares.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-
-# useful for handling different item types with a single interface
-
-
-class BoardGameScraperSpiderMiddleware:
-    pass
-
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    # @classmethod
-    # def from_crawler(cls, crawler: scrapy.crawler.Crawler):
-    #     # This method is used by Scrapy to create your spiders.
-    #     s = cls()
-    #     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-    #     return s
-
-    # def process_spider_input(
-    #     self,
-    #     response: scrapy.http.Response,
-    #     spider: scrapy.Spider,
-    # ):
-    #     # Called for each response that goes through the spider
-    #     # middleware and into the spider.
-
-    #     # Should return None or raise an exception.
-    #     return None
-
-    # def process_spider_output(
-    #     self,
-    #     response: scrapy.http.Response,
-    #     result,
-    #     spider: scrapy.Spider,
-    # ):
-    #     # Called with the results returned from the Spider, after
-    #     # it has processed the response.
-
-    #     # Must return an iterable of Request, or item objects.
-    #     yield from result
-
-    # def process_spider_exception(
-    #     self,
-    #     response: scrapy.http.Response,
-    #     exception,
-    #     spider: scrapy.Spider,
-    # ):
-    #     # Called when a spider or process_spider_input() method
-    #     # (from other spider middleware) raises an exception.
-
-    #     # Should return either None or an iterable of Request or item objects.
-    #     pass
-
-    # def process_start_requests(
-    #     self,
-    #     start_requests,
-    #     spider: scrapy.Spider,
-    # ):
-    #     # Called with the start requests of the spider, and works
-    #     # similarly to the process_spider_output() method, except
-    #     # that it doesn't have a response associated.
-
-    #     # Must return only requests (not items).
-    #     yield from start_requests
-
-    # def spider_opened(self, spider):
-    #     spider.logger.info("Spider opened: %s", spider.name)
-
-
-class BoardGameScraperDownloaderMiddleware:
-    pass
-
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the downloader middleware does not modify the
-    # passed objects.
-
-    # @classmethod
-    # def from_crawler(cls, crawler):
-    #     # This method is used by Scrapy to create your spiders.
-    #     s = cls()
-    #     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-    #     return s
-
-    # def process_request(self, request, spider):
-    #     # Called for each request that goes through the downloader
-    #     # middleware.
-
-    #     # Must either:
-    #     # - return None: continue processing this request
-    #     # - or return a Response object
-    #     # - or return a Request object
-    #     # - or raise IgnoreRequest: process_exception() methods of
-    #     #   installed downloader middleware will be called
-    #     return None
-
-    # def process_response(
-    #     self,
-    #     request,
-    #     response: scrapy.http.Response,
-    #     spider: scrapy.Spider,
-    # ):
-    #     # Called with the response returned from the downloader.
-
-    #     # Must either;
-    #     # - return a Response object
-    #     # - return a Request object
-    #     # - or raise IgnoreRequest
-    #     return response
-
-    # def process_exception(
-    #     self,
-    #     request,
-    #     exception,
-    #     spider: scrapy.Spider,
-    # ):
-    #     # Called when a download handler or a process_request()
-    #     # (from other downloader middleware) raises an exception.
-
-    #     # Must either:
-    #     # - return None: continue processing this exception
-    #     # - return a Response object: stops process_exception() chain
-    #     # - return a Request object: stops process_exception() chain
-    #     pass
-
-    # def spider_opened(
-    #     self,
-    #     spider: scrapy.Spider,
-    # ):
-    #     spider.logger.info("Spider opened: %s", spider.name)
diff --git a/src/board_game_scraper/pipelines.py b/src/board_game_scraper/pipelines.py
deleted file mode 100644
index c32def0..0000000
--- a/src/board_game_scraper/pipelines.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
-
-# useful for handling different item types with a single interface
-
-
-class BoardGameScraperPipeline:
-    pass
-
-    # def process_item(self, item, spider):
-    #     return item
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en", +} # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -# SPIDER_MIDDLEWARES = { -# "board_game_scraper.middlewares.BoardGameScraperSpiderMiddleware": 543, -# } +SPIDER_MIDDLEWARES = { + "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50, + "scrapy.spidermiddlewares.referer.RefererMiddleware": 700, + "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800, + "scrapy.spidermiddlewares.depth.DepthMiddleware": 900, +} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# "board_game_scraper.middlewares.BoardGameScraperDownloaderMiddleware": 543, -# } +DOWNLOADER_MIDDLEWARES = { + "scrapy.downloadermiddlewares.offsite.OffsiteMiddleware": 50, + "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100, + "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300, + "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350, + "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400, + "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500, + "scrapy.downloadermiddlewares.retry.RetryMiddleware": None, + "scrapy_extensions.DelayedRetryMiddleware": 555, + "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560, + "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580, + "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590, + "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600, + "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700, + "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750, + "scrapy.downloadermiddlewares.stats.DownloaderStats": 850, + "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900, +} # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html -# EXTENSIONS = { -# "scrapy.extensions.telnet.TelnetConsole": None, -# } +EXTENSIONS = { + "scrapy.extensions.corestats.CoreStats": 0, + "scrapy.extensions.telnet.TelnetConsole": 0, + "scrapy.extensions.memusage.MemoryUsage": 0, + "scrapy.extensions.memdebug.MemoryDebugger": 0, + "scrapy.extensions.closespider.CloseSpider": 0, + "scrapy.extensions.feedexport.FeedExporter": 0, + "scrapy.extensions.logstats.LogStats": 0, + "scrapy.extensions.spiderstate.SpiderState": 0, + "scrapy.extensions.throttle.AutoThrottle": None, + "scrapy_extensions.NicerAutoThrottle": 0, +} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -# ITEM_PIPELINES = { -# "board_game_scraper.pipelines.BoardGameScraperPipeline": 300, -# } +ITEM_PIPELINES = { + "scrapy.pipelines.images.ImagesPipeline": 600, + "scrapy_extensions.BlurHashPipeline": 700, +} + +# See https://doc.scrapy.org/en/latest/topics/extensions.html#module-scrapy.extensions.closespider +CLOSESPIDER_TIMEOUT = os.getenv("CLOSESPIDER_TIMEOUT") # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html -# AUTOTHROTTLE_ENABLED = True +AUTOTHROTTLE_ENABLED = True # The initial download delay -# AUTOTHROTTLE_START_DELAY = 5 +AUTOTHROTTLE_START_DELAY = max(DOWNLOAD_DELAY * 2, 5) # The maximum download delay to be set in case of high latencies -# AUTOTHROTTLE_MAX_DELAY = 60 +AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be 
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#     "scrapy.extensions.telnet.TelnetConsole": None,
-# }
+EXTENSIONS = {
+    "scrapy.extensions.corestats.CoreStats": 0,
+    "scrapy.extensions.telnet.TelnetConsole": 0,
+    "scrapy.extensions.memusage.MemoryUsage": 0,
+    "scrapy.extensions.memdebug.MemoryDebugger": 0,
+    "scrapy.extensions.closespider.CloseSpider": 0,
+    "scrapy.extensions.feedexport.FeedExporter": 0,
+    "scrapy.extensions.logstats.LogStats": 0,
+    "scrapy.extensions.spiderstate.SpiderState": 0,
+    "scrapy.extensions.throttle.AutoThrottle": None,
+    "scrapy_extensions.NicerAutoThrottle": 0,
+}
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-# ITEM_PIPELINES = {
-#     "board_game_scraper.pipelines.BoardGameScraperPipeline": 300,
-# }
+ITEM_PIPELINES = {
+    "scrapy.pipelines.images.ImagesPipeline": 600,
+    "scrapy_extensions.BlurHashPipeline": 700,
+}
+
+# See https://doc.scrapy.org/en/latest/topics/extensions.html#module-scrapy.extensions.closespider
+CLOSESPIDER_TIMEOUT = os.getenv("CLOSESPIDER_TIMEOUT")
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = max(DOWNLOAD_DELAY * 2, 5)
 # The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_TARGET_CONCURRENCY = CONCURRENT_REQUESTS_PER_DOMAIN
 # Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
+AUTOTHROTTLE_DEBUG = False
+AUTOTHROTTLE_HTTP_CODES = (429, 503, 504)
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-# HTTPCACHE_ENABLED = True
-# HTTPCACHE_EXPIRATION_SECS = 0
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24 * 7  # 1 week
 # HTTPCACHE_DIR = "httpcache"
-# HTTPCACHE_IGNORE_HTTP_CODES = []
+HTTPCACHE_IGNORE_HTTP_CODES = (202, 408, 429, 500, 502, 503, 504)
 # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
+
+# Retry settings
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#retrymiddleware-settings
+RETRY_ENABLED = True
+RETRY_TIMES = 2
+RETRY_HTTP_CODES = (408, 429, 500, 502, 503, 504, 522, 524)
+RETRY_PRIORITY_ADJUST = -1
+
+# Delayed retry settings
+DELAYED_RETRY_HTTP_CODES = (202,)
+DELAYED_RETRY_TIMES = -1
+DELAYED_RETRY_PRIORITY_ADJUST = 0
+DELAYED_RETRY_DELAY = 10.0
+DELAYED_RETRY_BACKOFF = True
+DELAYED_RETRY_BACKOFF_MAX_DELAY = 100.0
+
+MEDIA_ALLOW_REDIRECTS = True
+
+# Image processing
+# https://docs.scrapy.org/en/latest/topics/media-pipeline.html#using-the-images-pipeline
+IMAGES_STORE = BASE_DIR / "images"
+IMAGES_URLS_FIELD = "image_url_download"
+IMAGES_RESULT_FIELD = "image_file"
+IMAGES_EXPIRES = 360
+# IMAGES_THUMBS = {"thumb": (1024, 1024)}
+
+# BlurHash
+BLURHASH_FIELD = "image_blurhash"
+BLURHASH_X_COMPONENTS = 4
+BLURHASH_Y_COMPONENTS = 4
 
 # Set settings whose default value is deprecated to a future-proof value
 REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
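
Reviewer note: the snippet below is a minimal sketch (not part of the diff) for sanity-checking the new settings module without starting a crawl. It assumes the board_game_scraper package is importable and relies only on Scrapy's public Settings API (setmodule, getint, getfloat, getdict); the chosen env-var values are arbitrary examples.

import os

# Env-driven settings are read at import time, so set the variables first.
os.environ["LOG_LEVEL"] = "DEBUG"
os.environ["CLOSESPIDER_TIMEOUT"] = "3600"

from scrapy.settings import Settings

import board_game_scraper.settings as project_settings

settings = Settings()
settings.setmodule(project_settings, priority="project")

assert settings["LOG_LEVEL"] == "DEBUG"
assert settings.getint("CONCURRENT_REQUESTS") == 8
# CLOSESPIDER_TIMEOUT stays a string here; Scrapy's CloseSpider extension
# reads it with getfloat(), so the conversion happens on the consumer side.
assert settings.getfloat("CLOSESPIDER_TIMEOUT") == 3600.0
# Retry chain: the stock RetryMiddleware is off, DelayedRetryMiddleware is in.
middlewares = settings.getdict("DOWNLOADER_MIDDLEWARES")
assert middlewares["scrapy.downloadermiddlewares.retry.RetryMiddleware"] is None
assert middlewares["scrapy_extensions.DelayedRetryMiddleware"] == 555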