From 4bfc4b65656420e6912face2f1edd06ebbf53dc4 Mon Sep 17 00:00:00 2001 From: Emmanuel Rondan Date: Thu, 29 Aug 2024 10:41:41 -0300 Subject: [PATCH 1/8] adding pre-commit hooks config files --- .bandit.yml | 3 +++ .flake8 | 2 ++ .git-blame-ignore-revs | 1 + .github/workflows/main.yml | 6 ++++++ .isort.cfg | 2 ++ .pre-commit-config.yaml | 12 ++++++++++++ tox.ini | 8 +++----- 7 files changed, 29 insertions(+), 5 deletions(-) create mode 100644 .flake8 create mode 100644 .git-blame-ignore-revs create mode 100644 .isort.cfg diff --git a/.bandit.yml b/.bandit.yml index 2237265..6116afa 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -1,3 +1,6 @@ skips: - B101 # assert_used, needed for mypy +- B311 +- B320 +- B410 exclude_dirs: ['tests'] diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..2bcd70e --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 88 diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..e746ff9 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1 @@ +# applying pre-commit hooks to the project \ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b517eda..dd7e40f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,3 +65,9 @@ jobs: run: tox - name: Upload coverage.xml to codecov uses: codecov/codecov-action@v1 + + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: pre-commit/action@v3.0.0 diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..6860bdb --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = black \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e1dfbe6..3503b3a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,3 +4,15 @@ repos: hooks: - id: bandit args: [-r, -c, .bandit.yml] +- repo: https://github.com/psf/black.git + rev: 24.8.0 + hooks: + - id: black +- repo: https://github.com/PyCQA/flake8 + rev: 7.1.1 + hooks: + - id: flake8 +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort \ No newline at end of file diff --git a/tox.ini b/tox.ini index 6052f56..0c131f1 100644 --- a/tox.ini +++ b/tox.ini @@ -10,11 +10,9 @@ commands = py.test --doctest-modules --cov=scrapy_zyte_smartproxy {posargs:scrapy_zyte_smartproxy tests} [testenv:pre-commit] -basepython = python3 -deps = - pre-commit -commands = - pre-commit run {posargs:--all-files} +deps = pre-commit +commands = pre-commit run --all-files --show-diff-on-failure +skip_install = true [testenv:mypy] basepython = python3.10 From 951f357d3f60257618f7174f6ad39e9406441ecb Mon Sep 17 00:00:00 2001 From: Emmanuel Rondan Date: Thu, 29 Aug 2024 10:42:27 -0300 Subject: [PATCH 2/8] applying pre-commit hooks changes --- docs/conf.py | 69 +-- scrapy_zyte_smartproxy/__init__.py | 5 +- scrapy_zyte_smartproxy/middleware.py | 328 +++++++----- scrapy_zyte_smartproxy/utils.py | 5 +- setup.py | 64 +-- tests/test_all.py | 734 +++++++++++++++------------ 6 files changed, 671 insertions(+), 534 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index a605ed4..7899272 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,17 +12,18 @@ # All configuration values have a default; values that are commented out # serve to show the default. +import sys + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. 
If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # from os import path -import sys sys.path.insert(0, path.dirname(path.dirname(__file__))) -import sphinx_rtd_theme +import sphinx_rtd_theme # noqa: E402 html_theme = "sphinx_rtd_theme" @@ -38,25 +39,25 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autosectionlabel', + "sphinx.ext.autosectionlabel", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'scrapy-zyte-smartproxy' -copyright = u'2011-2021, Zyte Group Ltd' -author = u'Zyte' +project = "scrapy-zyte-smartproxy" +copyright = "2011-2021, Zyte Group Ltd" +author = "Zyte" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -65,19 +66,20 @@ try: import scrapy_zyte_smartproxy - version = '.'.join(scrapy_zyte_smartproxy.__version__.split('.')[:2]) + + version = ".".join(scrapy_zyte_smartproxy.__version__.split(".")[:2]) release = scrapy_zyte_smartproxy.__version__ except ImportError: - version = '' - release = '' + version = "" + release = "" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -98,13 +100,13 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = ['_static'] +# html_static_path = ['_static'] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. -htmlhelp_basename = 'scrapy-zyte-smartproxydoc' +htmlhelp_basename = "scrapy-zyte-smartproxydoc" # -- Options for LaTeX output --------------------------------------------- @@ -113,15 +115,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -133,10 +132,10 @@ latex_documents = [ ( master_doc, - 'scrapy-zyte-smartproxy.tex', - u'scrapy-zyte-smartproxy Documentation', - u'Zyte', - 'manual', + "scrapy-zyte-smartproxy.tex", + "scrapy-zyte-smartproxy Documentation", + "Zyte", + "manual", ), ] @@ -146,8 +145,13 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
man_pages = [ - (master_doc, 'scrapy-zyte-smartproxy', u'scrapy-zyte-smartproxy Documentation', - [author], 1) + ( + master_doc, + "scrapy-zyte-smartproxy", + "scrapy-zyte-smartproxy Documentation", + [author], + 1, + ) ] @@ -157,10 +161,13 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'scrapy-zyte-smartproxy', u'scrapy-zyte-smartproxy Documentation', - author, 'scrapy-zyte-smartproxy', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "scrapy-zyte-smartproxy", + "scrapy-zyte-smartproxy Documentation", + author, + "scrapy-zyte-smartproxy", + "One line description of project.", + "Miscellaneous", + ), ] - - - diff --git a/scrapy_zyte_smartproxy/__init__.py b/scrapy_zyte_smartproxy/__init__.py index 3b44f47..f4bd92b 100644 --- a/scrapy_zyte_smartproxy/__init__.py +++ b/scrapy_zyte_smartproxy/__init__.py @@ -1,4 +1 @@ -from .middleware import ZyteSmartProxyMiddleware - - -__version__ = '2.3.5' +__version__ = "2.3.5" diff --git a/scrapy_zyte_smartproxy/middleware.py b/scrapy_zyte_smartproxy/middleware.py index ddb9571..0aac2a6 100644 --- a/scrapy_zyte_smartproxy/middleware.py +++ b/scrapy_zyte_smartproxy/middleware.py @@ -1,24 +1,23 @@ -import os import logging +import os import warnings from base64 import urlsafe_b64decode from collections import defaultdict -from typing import Dict, List + try: from urllib.request import _parse_proxy # type: ignore except ImportError: from urllib2 import _parse_proxy # type: ignore -from six.moves.urllib.parse import urlparse, urlunparse -from w3lib.http import basic_auth_header from scrapy import signals -from scrapy.resolver import dnscache from scrapy.exceptions import ScrapyDeprecationWarning -from twisted.internet.error import ConnectionRefusedError, ConnectionDone +from scrapy.resolver import dnscache +from six.moves.urllib.parse import urlparse, urlunparse +from twisted.internet.error import ConnectionDone, ConnectionRefusedError +from w3lib.http import basic_auth_header from scrapy_zyte_smartproxy.utils import exp_backoff - logger = logging.getLogger(__name__) @@ -29,16 +28,16 @@ def _remove_auth(auth_proxy_url): class ZyteSmartProxyMiddleware(object): - url = 'http://proxy.zyte.com:8011' + url = "http://proxy.zyte.com:8011" maxbans = 400 ban_code = 503 download_timeout = 190 # Handle Zyte Smart Proxy Manager server failures connection_refused_delay = 90 preserve_delay = False - header_prefix = 'X-Crawlera-' # Deprecated - header_lowercase_prefixes = ('zyte-', 'x-crawlera-') - conflicting_headers = ('X-Crawlera-Profile', 'X-Crawlera-UA') + header_prefix = "X-Crawlera-" # Deprecated + header_lowercase_prefixes = ("zyte-", "x-crawlera-") + conflicting_headers = ("X-Crawlera-Profile", "X-Crawlera-UA") backoff_step = 15 backoff_max = 180 exp_backoff = None @@ -52,22 +51,24 @@ class ZyteSmartProxyMiddleware(object): b"zyte-jobid": b"x-crawlera-jobid", b"zyte-override-headers": b"x-crawlera-profile-pass", } - spm_to_zyte_api_translations = {v: k for k, v in zyte_api_to_spm_translations.items()} + spm_to_zyte_api_translations = { + v: k for k, v in zyte_api_to_spm_translations.items() + } _settings = [ - ('apikey', str), - ('url', str), - ('maxbans', int), - ('download_timeout', int), - ('preserve_delay', bool), - ('backoff_step', int), - ('backoff_max', int), - ('force_enable_on_http_codes', list), + ("apikey", str), + ("url", str), + ("maxbans", int), + ("download_timeout", int), + ("preserve_delay", bool), + ("backoff_step", int), + 
("backoff_max", int), + ("force_enable_on_http_codes", list), ] def __init__(self, crawler): self.crawler = crawler - self.job_id = os.environ.get('SCRAPY_JOB') + self.job_id = os.environ.get("SCRAPY_JOB") self.spider = None self._bans = defaultdict(int) self._saved_delays = defaultdict(lambda: None) @@ -85,14 +86,14 @@ def from_crawler(cls, crawler): def _make_auth_url(self, spider): parsed_url = urlparse(self.url) auth = self.get_proxyauth(spider) - if not auth.startswith(b'Basic '): + if not auth.startswith(b"Basic "): raise ValueError( - 'Zyte proxy services only support HTTP basic access ' - 'authentication, but %s.%s.get_proxyauth() returned %r' + "Zyte proxy services only support HTTP basic access " + "authentication, but %s.%s.get_proxyauth() returned %r" % (self.__module__, self.__class__.__name__, auth) ) - user_and_colon = urlsafe_b64decode(auth[6:].strip()).decode('utf-8') - netloc = user_and_colon + '@' + parsed_url.netloc.split('@')[-1] + user_and_colon = urlsafe_b64decode(auth[6:].strip()).decode("utf-8") + netloc = user_and_colon + "@" + parsed_url.netloc.split("@")[-1] parsed_url = parsed_url._replace(netloc=netloc) return urlunparse(parsed_url) @@ -104,7 +105,9 @@ def open_spider(self, spider): setattr(self, k, self._get_setting_value(spider, k, type_)) self._fix_url_protocol() - self._headers = self.crawler.settings.get('ZYTE_SMARTPROXY_DEFAULT_HEADERS', {}).items() + self._headers = self.crawler.settings.get( + "ZYTE_SMARTPROXY_DEFAULT_HEADERS", {} + ).items() self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max) if not self.enabled and not self.force_enable_on_http_codes: @@ -113,7 +116,7 @@ def open_spider(self, spider): if not self.apikey: logger.warning( "Zyte proxy services cannot be used without an API key", - extra={'spider': spider}, + extra={"spider": spider}, ) return @@ -121,10 +124,9 @@ def open_spider(self, spider): self._authless_url = _remove_auth(self._auth_url) logger.info( - "Using Zyte proxy service %s with an API key ending in %s" % ( - self.url, self.apikey[:7] - ), - extra={'spider': spider}, + "Using Zyte proxy service %s with an API key ending in %s" + % (self.url, self.apikey[:7]), + extra={"spider": spider}, ) if not self.preserve_delay: @@ -136,7 +138,7 @@ def open_spider(self, spider): "To avoid this behaviour you can use the " "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind " "that this may slow down the crawl significantly", - extra={'spider': spider}, + extra={"spider": spider}, ) def _settings_get(self, type_, *a, **kw): @@ -152,49 +154,69 @@ def _settings_get(self, type_, *a, **kw): return self.crawler.settings.get(*a, **kw) def _get_setting_value(self, spider, k, type_): - if hasattr(spider, 'hubproxy_' + k): - warnings.warn('hubproxy_%s attribute is deprecated, ' - 'use zyte_smartproxy_%s instead.' % (k, k), - category=ScrapyDeprecationWarning, stacklevel=1) + if hasattr(spider, "hubproxy_" + k): + warnings.warn( + "hubproxy_%s attribute is deprecated, " + "use zyte_smartproxy_%s instead." % (k, k), + category=ScrapyDeprecationWarning, + stacklevel=1, + ) - if self.crawler.settings.get('HUBPROXY_%s' % k.upper()) is not None: - warnings.warn('HUBPROXY_%s setting is deprecated, ' - 'use ZYTE_SMARTPROXY_%s instead.' % (k.upper(), k.upper()), - category=ScrapyDeprecationWarning, stacklevel=1) + if self.crawler.settings.get("HUBPROXY_%s" % k.upper()) is not None: + warnings.warn( + "HUBPROXY_%s setting is deprecated, " + "use ZYTE_SMARTPROXY_%s instead." 
% (k.upper(), k.upper()), + category=ScrapyDeprecationWarning, + stacklevel=1, + ) o = getattr(self, k, None) s = self._settings_get( - type_, 'ZYTE_SMARTPROXY_' + k.upper(), self._settings_get( - type_, 'HUBPROXY_' + k.upper(), o)) + type_, + "ZYTE_SMARTPROXY_" + k.upper(), + self._settings_get(type_, "HUBPROXY_" + k.upper(), o), + ) return getattr( - spider, 'zyte_smartproxy_' + k, getattr(spider, 'hubproxy_' + k, s)) + spider, "zyte_smartproxy_" + k, getattr(spider, "hubproxy_" + k, s) + ) def _fix_url_protocol(self): - if self.url.startswith('https://'): - logger.warning('ZYTE_SMARTPROXY_URL "%s" set with "https://" protocol.' % self.url) - elif not self.url.startswith('http://'): + if self.url.startswith("https://"): + logger.warning( + 'ZYTE_SMARTPROXY_URL "%s" set with "https://" protocol.' % self.url + ) + elif not self.url.startswith("http://"): logger.warning('Adding "http://" to ZYTE_SMARTPROXY_URL %s' % self.url) - self.url = 'http://' + self.url + self.url = "http://" + self.url def is_enabled(self, spider): """Hook to enable middleware by custom rules.""" - if hasattr(spider, 'use_hubproxy'): - warnings.warn('use_hubproxy attribute is deprecated, ' - 'use zyte_smartproxy_enabled instead.', - category=ScrapyDeprecationWarning, stacklevel=1) - - if self.crawler.settings.get('HUBPROXY_ENABLED') is not None: - warnings.warn('HUBPROXY_ENABLED setting is deprecated, ' - 'use ZYTE_SMARTPROXY_ENABLED instead.', - category=ScrapyDeprecationWarning, stacklevel=1) - return ( - getattr(spider, 'zyte_smartproxy_enabled', self.crawler.settings.getbool('ZYTE_SMARTPROXY_ENABLED')) or - getattr(spider, 'use_hubproxy', self.crawler.settings.getbool("HUBPROXY_ENABLED")) + if hasattr(spider, "use_hubproxy"): + warnings.warn( + "use_hubproxy attribute is deprecated, " + "use zyte_smartproxy_enabled instead.", + category=ScrapyDeprecationWarning, + stacklevel=1, + ) + + if self.crawler.settings.get("HUBPROXY_ENABLED") is not None: + warnings.warn( + "HUBPROXY_ENABLED setting is deprecated, " + "use ZYTE_SMARTPROXY_ENABLED instead.", + category=ScrapyDeprecationWarning, + stacklevel=1, + ) + return getattr( + spider, + "zyte_smartproxy_enabled", + self.crawler.settings.getbool("ZYTE_SMARTPROXY_ENABLED"), + ) or getattr( + spider, "use_hubproxy", self.crawler.settings.getbool("HUBPROXY_ENABLED") ) def get_proxyauth(self, spider): """Hook to compute Proxy-Authorization header by custom rules.""" - return basic_auth_header(self.apikey, '') + return basic_auth_header(self.apikey, "") def _targets_zyte_api(self, request): if self._auth_url is None: @@ -208,7 +230,8 @@ def _targets_zyte_api(self, request): def _translate_headers(self, request, targets_zyte_api): translation_dict = ( - self.spm_to_zyte_api_translations if targets_zyte_api + self.spm_to_zyte_api_translations + if targets_zyte_api else self.zyte_api_to_spm_translations ) for header, translation in translation_dict.items(): @@ -216,7 +239,7 @@ def _translate_headers(self, request, targets_zyte_api): continue request.headers[translation] = value = request.headers.pop(header) logger.warning( - "Translating (and dropping) header %r (%r) as %r on request %r", + "Translating header %r (%r) to %r on request %r", header, value, translation, @@ -229,10 +252,10 @@ def _inc_stat(self, stat, targets_zyte_api, value=1): def process_request(self, request, spider): if self._is_enabled_for_request(request): - if 'proxy' not in request.meta: - request.meta['proxy'] = self._auth_url + if "proxy" not in request.meta: + request.meta["proxy"] = self._auth_url 
elif ( - request.meta['proxy'] == self._authless_url + request.meta["proxy"] == self._authless_url and b"Proxy-Authorization" not in request.headers ): logger.warning( @@ -243,55 +266,60 @@ def process_request(self, request, spider): "middlewares from one request to another is a bad " "practice that can cause issues.".format(request=request) ) - request.meta['proxy'] = self._auth_url + request.meta["proxy"] = self._auth_url targets_zyte_api = self._targets_zyte_api(request) self._set_zyte_smartproxy_default_headers(request) - request.meta['download_timeout'] = self.download_timeout + request.meta["download_timeout"] = self.download_timeout if self.job_id: - job_header = 'Zyte-JobId' if targets_zyte_api else 'X-Crawlera-JobId' + job_header = "Zyte-JobId" if targets_zyte_api else "X-Crawlera-JobId" request.headers[job_header] = self.job_id - user_agent_header = "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client" + user_agent_header = ( + "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client" + ) from scrapy_zyte_smartproxy import __version__ - request.headers[user_agent_header] = 'scrapy-zyte-smartproxy/%s' % __version__ + + request.headers[user_agent_header] = ( + "scrapy-zyte-smartproxy/%s" % __version__ + ) self._inc_stat("request", targets_zyte_api=targets_zyte_api) - self._inc_stat("request/method/{}".format(request.method), targets_zyte_api=targets_zyte_api) + self._inc_stat( + "request/method/{}".format(request.method), + targets_zyte_api=targets_zyte_api, + ) self._translate_headers(request, targets_zyte_api=targets_zyte_api) - self._clean_zyte_smartproxy_headers(request, targets_zyte_api=targets_zyte_api) + self._clean_zyte_smartproxy_headers( + request, targets_zyte_api=targets_zyte_api + ) else: self._clean_zyte_smartproxy_headers(request) def _is_banned(self, response): return ( response.status == self.ban_code - and response.headers.get('X-Crawlera-Error') == b'banned' - ) or ( - response.status in {520, 521} - and response.headers.get('Zyte-Error') - ) + and response.headers.get("X-Crawlera-Error") == b"banned" + ) or (response.status in {520, 521} and response.headers.get("Zyte-Error")) def _is_auth_error(self, response): return ( - response.status == 407 and - response.headers.get('X-Crawlera-Error') == b'bad_proxy_auth' + response.status == 407 + and response.headers.get("X-Crawlera-Error") == b"bad_proxy_auth" ) def _throttle_error(self, response): - error = response.headers.get('Zyte-Error') or response.headers.get('X-Crawlera-Error') - if ( - response.status in {429, 503} - and error - and error != b"banned" - ): + error = response.headers.get("Zyte-Error") or response.headers.get( + "X-Crawlera-Error" + ) + if response.status in {429, 503} and error and error != b"banned": return error.decode() return None def _process_error(self, response): if "Zyte-Error" in response.headers: - value = response.headers.get('Zyte-Error') + value = response.headers.get("Zyte-Error") response.headers["X-Crawlera-Error"] = value return value if "X-Crawlera-Error" in response.headers: - value = response.headers.get('X-Crawlera-Error') + value = response.headers.get("X-Crawlera-Error") response.headers["Zyte-Error"] = value return value return None @@ -302,7 +330,9 @@ def process_response(self, request, response, spider): targets_zyte_api = self._targets_zyte_api(request) if not self._is_enabled_for_request(request): - return self._handle_not_enabled_response(request, response, targets_zyte_api=targets_zyte_api) + return self._handle_not_enabled_response( + request, response, 
targets_zyte_api=targets_zyte_api
+            )
 
         if not self._is_zyte_smartproxy_or_zapi_response(response):
             return response
@@ -314,11 +344,16 @@ def process_response(self, request, response, spider):
         throttle_error = self._throttle_error(response)
         if is_auth_error or throttle_error:
             if is_auth_error:
-                reason = 'autherror'
+                reason = "autherror"
             else:
                 assert throttle_error
                 reason = throttle_error.lstrip("/")
-            self._set_custom_delay(request, next(self.exp_backoff), reason=reason, targets_zyte_api=targets_zyte_api)
+            self._set_custom_delay(
+                request,
+                next(self.exp_backoff),
+                reason=reason,
+                targets_zyte_api=targets_zyte_api,
+            )
         else:
             self._inc_stat("delay/reset_backoff", targets_zyte_api=targets_zyte_api)
             self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
@@ -326,35 +361,51 @@
         if is_auth_error:
             # When Zyte Smart Proxy Manager has issues it might not be able to
             # authenticate users we must retry
-            retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0)
+            retries = request.meta.get("zyte_smartproxy_auth_retry_times", 0)
             if retries < self.max_auth_retry_times:
-                return self._retry_auth(response, request, spider, targets_zyte_api=targets_zyte_api)
+                return self._retry_auth(
+                    response, request, spider, targets_zyte_api=targets_zyte_api
+                )
             else:
-                self._inc_stat("retries/auth/max_reached", targets_zyte_api=targets_zyte_api)
+                self._inc_stat(
+                    "retries/auth/max_reached", targets_zyte_api=targets_zyte_api
+                )
                 logger.warning(
-                    "Max retries for authentication issues reached, please check auth"
-                    " information settings",
-                    extra={'spider': self.spider},
+                    "Max retries for authentication issues reached, "
+                    "please check auth information settings",
+                    extra={"spider": self.spider},
                 )
 
         if self._is_banned(response):
             self._bans[key] += 1
             if self._bans[key] > self.maxbans:
-                self.crawler.engine.close_spider(spider, 'banned')
+                self.crawler.engine.close_spider(spider, "banned")
             else:
-                after = response.headers.get('retry-after')
+                after = response.headers.get("retry-after")
                 if after:
-                    self._set_custom_delay(request, float(after), reason='banned', targets_zyte_api=targets_zyte_api)
+                    self._set_custom_delay(
+                        request,
+                        float(after),
+                        reason="banned",
+                        targets_zyte_api=targets_zyte_api,
+                    )
             self._inc_stat("response/banned", targets_zyte_api=targets_zyte_api)
         else:
             self._bans[key] = 0
-        # If placed behind `RedirectMiddleware`, it would not count 3xx responses
+        # If placed behind `RedirectMiddleware`,
+        # it would not count 3xx responses
         self._inc_stat("response", targets_zyte_api=targets_zyte_api)
-        self._inc_stat("response/status/{}".format(response.status), targets_zyte_api=targets_zyte_api)
+        self._inc_stat(
+            "response/status/{}".format(response.status),
+            targets_zyte_api=targets_zyte_api,
+        )
        if zyte_smartproxy_error:
             self._inc_stat("response/error", targets_zyte_api=targets_zyte_api)
-            error_msg = zyte_smartproxy_error.decode('utf8')
-            self._inc_stat("response/error/{}".format(error_msg), targets_zyte_api=targets_zyte_api)
+            error_msg = zyte_smartproxy_error.decode("utf8")
+            self._inc_stat(
+                "response/error/{}".format(error_msg),
+                targets_zyte_api=targets_zyte_api,
+            )
         return response
 
     def process_exception(self, request, exception, spider):
@@ -364,7 +415,12 @@
             # Handle Zyte Smart Proxy Manager downtime
             self._clear_dns_cache()
             targets_zyte_api = self._targets_zyte_api(request)
-            self._set_custom_delay(request, self.connection_refused_delay, 
reason='conn_refused', targets_zyte_api=targets_zyte_api) + self._set_custom_delay( + request, + self.connection_refused_delay, + reason="conn_refused", + targets_zyte_api=targets_zyte_api, + ) def _handle_not_enabled_response(self, request, response, targets_zyte_api): if self._should_enable_for_response(response): @@ -373,7 +429,10 @@ def _handle_not_enabled_response(self, request, response, targets_zyte_api): retryreq = request.copy() retryreq.dont_filter = True - self._inc_stat("retries/should_have_been_enabled", targets_zyte_api=targets_zyte_api) + self._inc_stat( + "retries/should_have_been_enabled", + targets_zyte_api=targets_zyte_api, + ) return retryreq return response @@ -383,11 +442,11 @@ def _retry_auth(self, response, request, spider, targets_zyte_api): "Retrying a request due to an authentication issue with " "the configured Zyte proxy service" ), - extra={'spider': self.spider}, + extra={"spider": self.spider}, ) - retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0) + 1 + retries = request.meta.get("zyte_smartproxy_auth_retry_times", 0) + 1 retryreq = request.copy() - retryreq.meta['zyte_smartproxy_auth_retry_times'] = retries + retryreq.meta["zyte_smartproxy_auth_retry_times"] = retries retryreq.dont_filter = True self._inc_stat("retries/auth", targets_zyte_api=targets_zyte_api) return retryreq @@ -403,7 +462,7 @@ def _should_enable_for_response(self, response): def _is_enabled_for_request(self, request): domain = self._get_url_domain(request.url) domain_enabled = self.enabled_for_domain.get(domain, False) - dont_proxy = request.meta.get('dont_proxy', False) + dont_proxy = request.meta.get("dont_proxy", False) return (domain_enabled or self.enabled) and not dont_proxy def _get_url_domain(self, url): @@ -418,7 +477,7 @@ def _is_zyte_smartproxy_or_zapi_response(self, response): ) def _get_slot_key(self, request): - return request.meta.get('download_slot') + return request.meta.get("download_slot") def _get_slot(self, request): key = self._get_slot_key(request) @@ -434,7 +493,11 @@ def _set_custom_delay(self, request, delay, targets_zyte_api, reason=None): slot.delay = delay if reason is not None: self._inc_stat("delay/{}".format(reason), targets_zyte_api=targets_zyte_api) - self._inc_stat("delay/{}/total".format(reason), value=delay, targets_zyte_api=targets_zyte_api) + self._inc_stat( + "delay/{}/total".format(reason), + value=delay, + targets_zyte_api=targets_zyte_api, + ) def _restore_original_delay(self, request): """Restore original delay for slot if it was changed.""" @@ -449,9 +512,9 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None): if targets_zyte_api is None: prefixes = self.header_lowercase_prefixes elif targets_zyte_api: - prefixes = ('x-crawlera-',) + prefixes = ("x-crawlera-",) else: - prefixes = ('zyte-',) + prefixes = ("zyte-",) targets = [ header for header in request.headers @@ -471,7 +534,8 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None): "request is proxied with %s and not with %s, and " "automatic translation is not supported for this " "header. See " - "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping" + "https://docs.zyte.com/zyte-api/migration/zyte/" + "smartproxy.html#parameter-mapping" " to learn the right way to translate this header " "manually." 
), @@ -485,11 +549,8 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None): def _is_zyte_smartproxy_header(self, header_name, prefixes): if not header_name: return False - header_name = header_name.decode('utf-8').lower() - return any( - header_name.startswith(prefix) - for prefix in prefixes - ) + header_name = header_name.decode("utf-8").lower() + return any(header_name.startswith(prefix) for prefix in prefixes) def _set_zyte_smartproxy_default_headers(self, request): for header, value in self._headers: @@ -497,23 +558,24 @@ def _set_zyte_smartproxy_default_headers(self, request): continue request.headers.setdefault(header, value) lower_case_headers = [ - header.decode('utf-8').lower() for header in request.headers + header.decode("utf-8").lower() for header in request.headers ] if all(h.lower() in lower_case_headers for h in self.conflicting_headers): - # Send a general warning once, and specific urls if LOG_LEVEL = DEBUG + # Send a general warning once, + # and specific urls if LOG_LEVEL = DEBUG warnings.warn( - 'The headers %s are conflicting on some of your requests. ' - 'Please check ' - 'https://docs.zyte.com/smart-proxy-manager.html#request-headers ' - 'for more information. You can set LOG_LEVEL=DEBUG to see the ' - 'urls with problems.' - % str(self.conflicting_headers) + "The headers %s are conflicting on some of your requests. " + "Please check " + "https://docs.zyte.com/smart-proxy-manager.html" + "#request-headers " + "for more information. You can set LOG_LEVEL=DEBUG to see the " + "urls with problems." % str(self.conflicting_headers) ) logger.debug( - 'The headers %s are conflicting on request %s. X-Crawlera-UA ' - 'will be ignored. Please check ' - 'https://docs.zyte.com/smart-proxy-manager.html#request-headers ' - 'for more information' + "The headers %s are conflicting on request %s. X-Crawlera-UA " + "will be ignored. 
Please check " + "https://docs.zyte.com/smart-proxy-manager.html" + "#request-headers for more information" % (str(self.conflicting_headers), request.url), - extra={'spider': self.spider}, + extra={"spider": self.spider}, ) diff --git a/scrapy_zyte_smartproxy/utils.py b/scrapy_zyte_smartproxy/utils.py index fa37659..d6a49cc 100644 --- a/scrapy_zyte_smartproxy/utils.py +++ b/scrapy_zyte_smartproxy/utils.py @@ -1,16 +1,15 @@ import math import random - from itertools import count def exp_backoff(step, max): - """ Exponential backoff time with Full Jitter """ + """Exponential backoff time with Full Jitter""" # this is a numerically stable version of # random.uniform(0, min(max, step * 2 ** attempt)) max_attempts = math.log(max / step, 2) for attempt in count(0, 1): if attempt <= max_attempts: - yield random.uniform(0, step * 2 ** attempt) # nosec + yield random.uniform(0, step * 2**attempt) # nosec else: yield random.uniform(0, max) # nosec diff --git a/setup.py b/setup.py index 91755d8..b0192b8 100644 --- a/setup.py +++ b/setup.py @@ -5,39 +5,39 @@ setup( - name='scrapy-zyte-smartproxy', - version='2.3.5', - license='BSD', - description='Scrapy middleware for Zyte Smart Proxy Manager', + name="scrapy-zyte-smartproxy", + version="2.3.5", + license="BSD", + description="Scrapy middleware for Zyte Smart Proxy Manager", long_description=readme, - maintainer='Raul Gallegos', - maintainer_email='raul.ogh@gmail.com', - author='Zyte', - author_email='opensource@zyte.com', - url='https://github.com/scrapy-plugins/scrapy-zyte-smartproxy', - packages=['scrapy_zyte_smartproxy'], - platforms=['Any'], + maintainer="Raul Gallegos", + maintainer_email="raul.ogh@gmail.com", + author="Zyte", + author_email="opensource@zyte.com", + url="https://github.com/scrapy-plugins/scrapy-zyte-smartproxy", + packages=["scrapy_zyte_smartproxy"], + platforms=["Any"], classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Programming Language :: Python :: 3.12', - 'Framework :: Scrapy', - 'Intended Audience :: Developers', - 'Topic :: Internet :: WWW/HTTP', - 'Topic :: Internet :: Proxy Servers', - 'Topic :: Software Development :: Libraries :: Application Frameworks', - 'Topic :: Software Development :: Libraries :: Python Modules', + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Framework :: Scrapy", + "Intended Audience :: Developers", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Internet :: Proxy Servers", + "Topic :: Software Development :: Libraries :: 
Application Frameworks", + "Topic :: Software Development :: Libraries :: Python Modules", ], - install_requires=['scrapy>=1.4.0', 'six', 'w3lib'], + install_requires=["scrapy>=1.4.0", "six", "w3lib"], ) diff --git a/tests/test_all.py b/tests/test_all.py index bedc8c6..71e1fd9 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -1,26 +1,26 @@ import binascii import os -import pytest from copy import copy from random import choice from unittest import TestCase + +import pytest + try: from unittest.mock import call, patch # type: ignore except ImportError: from mock import call, patch # type: ignore -from w3lib.http import basic_auth_header from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http import Request, Response +from scrapy.resolver import dnscache from scrapy.spiders import Spider from scrapy.utils.test import get_crawler -from scrapy.resolver import dnscache -from scrapy.exceptions import ScrapyDeprecationWarning -from twisted.internet.error import ConnectionRefusedError, ConnectionDone - -from scrapy_zyte_smartproxy import __version__, ZyteSmartProxyMiddleware -from scrapy_zyte_smartproxy.utils import exp_backoff +from twisted.internet.error import ConnectionDone, ConnectionRefusedError +from w3lib.http import basic_auth_header +from scrapy_zyte_smartproxy import ZyteSmartProxyMiddleware, __version__ RESPONSE_IDENTIFYING_HEADERS = ( ("X-Crawlera-Version", None), @@ -44,12 +44,14 @@ class ZyteSmartProxyMiddlewareTestCase(TestCase): auth_error_code = 407 def setUp(self): - self.spider = Spider('foo') - self.settings = {'ZYTE_SMARTPROXY_APIKEY': 'apikey'} + self.spider = Spider("foo") + self.settings = {"ZYTE_SMARTPROXY_APIKEY": "apikey"} Response_init_orig = Response.__init__ def Response_init_new(self, *args, **kwargs): - assert not kwargs.get('request'), 'response objects at this stage shall not be pinned' + assert not kwargs.get( + "request" + ), "response objects at this stage shall not be pinned" return Response_init_orig(self, *args, **kwargs) Response.__init__ = Response_init_new @@ -81,62 +83,67 @@ def _assert_disabled(self, spider, settings=None): crawler = self._mock_crawler(spider, settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) - req = Request('http://example.com') + req = Request("http://example.com") out = mw.process_request(req, spider) self.assertEqual(out, None) - self.assertEqual(req.meta.get('proxy'), None) - self.assertEqual(req.meta.get('download_timeout'), None) - self.assertEqual(req.headers.get('Proxy-Authorization'), None) + self.assertEqual(req.meta.get("proxy"), None) + self.assertEqual(req.meta.get("download_timeout"), None) + self.assertEqual(req.headers.get("Proxy-Authorization"), None) res = Response(req.url) assert mw.process_response(req, res, spider) is res res = Response(req.url, status=mw.ban_code) assert mw.process_response(req, res, spider) is res - def _assert_enabled(self, spider, - settings=None, - proxyurl='http://proxy.zyte.com:8011', - proxyurlcreds='http://apikey:@proxy.zyte.com:8011', - proxyauth=basic_auth_header('apikey', ''), - maxbans=400, - download_timeout=190): + def _assert_enabled( + self, + spider, + settings=None, + proxyurl="http://proxy.zyte.com:8011", + proxyurlcreds="http://apikey:@proxy.zyte.com:8011", + proxyauth=basic_auth_header("apikey", ""), + maxbans=400, + download_timeout=190, + ): crawler = self._mock_crawler(spider, settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) httpproxy 
= HttpProxyMiddleware.from_crawler(crawler) assert mw.url == proxyurl - req = Request('http://example.com') + req = Request("http://example.com") assert mw.process_request(req, spider) is None - self.assertEqual(req.meta.get('proxy'), proxyurlcreds) - self.assertEqual(req.meta.get('download_timeout'), download_timeout) - self.assertNotIn(b'Proxy-Authorization', req.headers) + self.assertEqual(req.meta.get("proxy"), proxyurlcreds) + self.assertEqual(req.meta.get("download_timeout"), download_timeout) + self.assertNotIn(b"Proxy-Authorization", req.headers) res = self._mock_zyte_smartproxy_response(req.url) assert mw.process_response(req, res, spider) is res # disabled if 'dont_proxy=True' is set - req = Request('http://example.com') - req.meta['dont_proxy'] = True + req = Request("http://example.com") + req.meta["dont_proxy"] = True assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None - self.assertEqual(req.meta.get('proxy'), None) - self.assertEqual(req.meta.get('download_timeout'), None) - self.assertNotIn(b'Proxy-Authorization', req.headers) + self.assertEqual(req.meta.get("proxy"), None) + self.assertEqual(req.meta.get("download_timeout"), None) + self.assertNotIn(b"Proxy-Authorization", req.headers) res = self._mock_zyte_smartproxy_response(req.url) assert mw.process_response(req, res, spider) is res - del req.meta['dont_proxy'] + del req.meta["dont_proxy"] assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None - self.assertEqual(req.meta.get('proxy'), proxyurl) - self.assertEqual(req.meta.get('download_timeout'), download_timeout) - self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth) + self.assertEqual(req.meta.get("proxy"), proxyurl) + self.assertEqual(req.meta.get("download_timeout"), download_timeout) + self.assertEqual(req.headers.get("Proxy-Authorization"), proxyauth) if maxbans > 0: # assert ban count is reseted after a succesful response - res = self._mock_zyte_smartproxy_response('http://banned.example', status=self.bancode) + res = self._mock_zyte_smartproxy_response( + "http://banned.example", status=self.bancode + ) assert mw.process_response(req, res, spider) is res self.assertEqual(crawler.engine.fake_spider_closed_result, None) - res = self._mock_zyte_smartproxy_response('http://unbanned.example') + res = self._mock_zyte_smartproxy_response("http://unbanned.example") assert mw.process_response(req, res, spider) is res self.assertEqual(crawler.engine.fake_spider_closed_result, None) self.assertEqual(mw._bans[None], 0) @@ -145,22 +152,22 @@ def _assert_enabled(self, spider, for x in range(maxbans + 1): self.assertEqual(crawler.engine.fake_spider_closed_result, None) res = self._mock_zyte_smartproxy_response( - 'http://banned.example/%d' % x, + "http://banned.example/%d" % x, status=self.bancode, - headers={'X-Crawlera-Error': 'banned'}, + headers={"X-Crawlera-Error": "banned"}, ) assert mw.process_response(req, res, spider) is res assert res.headers["X-Crawlera-Error"] == b"banned" assert res.headers["Zyte-Error"] == b"banned" # max bans reached and close_spider called - self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned')) + self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, "banned")) def test_disabled_by_lack_of_zyte_smartproxy_settings(self): self._assert_disabled(self.spider, settings={}) def test_spider_zyte_smartproxy_enabled(self): - self.assertFalse(hasattr(self.spider, 'zyte_smartproxy_enabled')) + 
self.assertFalse(hasattr(self.spider, "zyte_smartproxy_enabled")) self._assert_disabled(self.spider, self.settings) self.spider.zyte_smartproxy_enabled = True self._assert_enabled(self.spider, self.settings) @@ -169,89 +176,119 @@ def test_spider_zyte_smartproxy_enabled(self): def test_enabled(self): self._assert_disabled(self.spider, self.settings) - self.settings['ZYTE_SMARTPROXY_ENABLED'] = True + self.settings["ZYTE_SMARTPROXY_ENABLED"] = True self._assert_enabled(self.spider, self.settings) def test_spider_zyte_smartproxy_enabled_priority(self): self.spider.zyte_smartproxy_enabled = False - self.settings['ZYTE_SMARTPROXY_ENABLED'] = True + self.settings["ZYTE_SMARTPROXY_ENABLED"] = True self._assert_disabled(self.spider, self.settings) self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_ENABLED'] = False + self.settings["ZYTE_SMARTPROXY_ENABLED"] = False self._assert_enabled(self.spider, self.settings) self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_ENABLED'] = True + self.settings["ZYTE_SMARTPROXY_ENABLED"] = True self._assert_enabled(self.spider, self.settings) self.spider.zyte_smartproxy_enabled = False - self.settings['ZYTE_SMARTPROXY_ENABLED'] = False + self.settings["ZYTE_SMARTPROXY_ENABLED"] = False self._assert_disabled(self.spider, self.settings) def test_apikey(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_APIKEY'] = apikey = 'apikey' - proxyauth = basic_auth_header(apikey, '') - self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://apikey:@proxy.zyte.com:8011') + self.settings["ZYTE_SMARTPROXY_APIKEY"] = apikey = "apikey" + proxyauth = basic_auth_header(apikey, "") + self._assert_enabled( + self.spider, + self.settings, + proxyauth=proxyauth, + proxyurlcreds="http://apikey:@proxy.zyte.com:8011", + ) - apikey = 'notfromsettings' - proxyauth = basic_auth_header(apikey, '') + apikey = "notfromsettings" + proxyauth = basic_auth_header(apikey, "") self.spider.zyte_smartproxy_apikey = apikey - self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://notfromsettings:@proxy.zyte.com:8011') + self._assert_enabled( + self.spider, + self.settings, + proxyauth=proxyauth, + proxyurlcreds="http://notfromsettings:@proxy.zyte.com:8011", + ) def test_proxyurl(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_URL'] = 'http://localhost:8011' - self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011', proxyurlcreds='http://apikey:@localhost:8011') + self.settings["ZYTE_SMARTPROXY_URL"] = "http://localhost:8011" + self._assert_enabled( + self.spider, + self.settings, + proxyurl="http://localhost:8011", + proxyurlcreds="http://apikey:@localhost:8011", + ) def test_proxyurl_no_protocol(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_URL'] = 'localhost:8011' - self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011', proxyurlcreds='http://apikey:@localhost:8011') + self.settings["ZYTE_SMARTPROXY_URL"] = "localhost:8011" + self._assert_enabled( + self.spider, + self.settings, + proxyurl="http://localhost:8011", + proxyurlcreds="http://apikey:@localhost:8011", + ) def test_proxyurl_https(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_URL'] = 'https://localhost:8011' - self._assert_enabled(self.spider, self.settings, proxyurl='https://localhost:8011', 
proxyurlcreds='https://apikey:@localhost:8011') + self.settings["ZYTE_SMARTPROXY_URL"] = "https://localhost:8011" + self._assert_enabled( + self.spider, + self.settings, + proxyurl="https://localhost:8011", + proxyurlcreds="https://apikey:@localhost:8011", + ) def test_proxyurl_including_noconnect(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_URL'] = 'http://localhost:8011?noconnect' - self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011?noconnect', proxyurlcreds='http://apikey:@localhost:8011?noconnect') + self.settings["ZYTE_SMARTPROXY_URL"] = "http://localhost:8011?noconnect" + self._assert_enabled( + self.spider, + self.settings, + proxyurl="http://localhost:8011?noconnect", + proxyurlcreds="http://apikey:@localhost:8011?noconnect", + ) def test_maxbans(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_MAXBANS'] = maxbans = 0 + self.settings["ZYTE_SMARTPROXY_MAXBANS"] = maxbans = 0 self._assert_enabled(self.spider, self.settings, maxbans=maxbans) - self.settings['ZYTE_SMARTPROXY_MAXBANS'] = maxbans = 100 + self.settings["ZYTE_SMARTPROXY_MAXBANS"] = maxbans = 100 self._assert_enabled(self.spider, self.settings, maxbans=maxbans) # Assert setting is coerced into correct type - self.settings['ZYTE_SMARTPROXY_MAXBANS'] = '123' + self.settings["ZYTE_SMARTPROXY_MAXBANS"] = "123" self._assert_enabled(self.spider, self.settings, maxbans=123) self.spider.zyte_smartproxy_maxbans = 99 self._assert_enabled(self.spider, self.settings, maxbans=99) def test_download_timeout(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT'] = 60 + self.settings["ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT"] = 60 self._assert_enabled(self.spider, self.settings, download_timeout=60) # Assert setting is coerced into correct type - self.settings['ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT'] = '42' + self.settings["ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT"] = "42" self._assert_enabled(self.spider, self.settings, download_timeout=42) self.spider.zyte_smartproxy_download_timeout = 120 self._assert_enabled(self.spider, self.settings, download_timeout=120) def test_hooks(self): - proxyauth = basic_auth_header('foo', '') + proxyauth = basic_auth_header("foo", "") class _ECLS(self.mwcls): def is_enabled(self, spider): - wascalled.append('is_enabled') + wascalled.append("is_enabled") return enabled def get_proxyauth(self, spider): - wascalled.append('get_proxyauth') + wascalled.append("get_proxyauth") return proxyauth wascalled = [] @@ -261,19 +298,24 @@ def get_proxyauth(self, spider): enabled = False self.spider.zyte_smartproxy_enabled = True self._assert_disabled(self.spider, self.settings) - self.assertEqual(wascalled, ['is_enabled']) + self.assertEqual(wascalled, ["is_enabled"]) wascalled[:] = [] # reset enabled = True self.spider.zyte_smartproxy_enabled = False - self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://foo:@proxy.zyte.com:8011') - self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth']) + self._assert_enabled( + self.spider, + self.settings, + proxyauth=proxyauth, + proxyurlcreds="http://foo:@proxy.zyte.com:8011", + ) + self.assertEqual(wascalled, ["is_enabled", "get_proxyauth"]) def test_delay_adjustment(self): delay = 0.5 - slot_key = 'example.com' - url = 'http://example.com' - ban_url = 'http://banned.example' + slot_key = "example.com" + url = "http://example.com" + ban_url = "http://banned.example" self.spider.zyte_smartproxy_enabled = True @@ 
-296,10 +338,10 @@ def test_delay_adjustment(self): crawler.engine.downloader.slots[slot_key] = slot # ban without retry-after - req = Request(url, meta={'download_slot': slot_key}) + req = Request(url, meta={"download_slot": slot_key}) assert mw.process_request(req, self.spider) is None assert httpproxy.process_request(req, self.spider) is None - headers = {'X-Crawlera-Error': 'banned'} + headers = {"X-Crawlera-Error": "banned"} res = self._mock_zyte_smartproxy_response( ban_url, status=self.bancode, @@ -311,10 +353,7 @@ def test_delay_adjustment(self): # ban with retry-after retry_after = 1.5 - headers = { - 'retry-after': str(retry_after), - 'X-Crawlera-Error': 'banned' - } + headers = {"retry-after": str(retry_after), "X-Crawlera-Error": "banned"} res = self._mock_zyte_smartproxy_response( ban_url, status=self.bancode, @@ -325,43 +364,43 @@ def test_delay_adjustment(self): self.assertEqual(self.spider.download_delay, delay) # DNS cache should be cleared in case of errors - dnscache['proxy.zyte.com'] = '1.1.1.1' + dnscache["proxy.zyte.com"] = "1.1.1.1" res = self._mock_zyte_smartproxy_response(url) mw.process_response(req, res, self.spider) self.assertEqual(slot.delay, delay) self.assertEqual(self.spider.download_delay, delay) - self.assertIn('proxy.zyte.com', dnscache) + self.assertIn("proxy.zyte.com", dnscache) # server failures mw.process_exception(req, ConnectionRefusedError(), self.spider) self.assertEqual(slot.delay, mw.connection_refused_delay) self.assertEqual(self.spider.download_delay, delay) - self.assertNotIn('proxy.zyte.com', dnscache) + self.assertNotIn("proxy.zyte.com", dnscache) - dnscache['proxy.zyte.com'] = '1.1.1.1' + dnscache["proxy.zyte.com"] = "1.1.1.1" res = self._mock_zyte_smartproxy_response(ban_url) mw.process_response(req, res, self.spider) self.assertEqual(slot.delay, delay) self.assertEqual(self.spider.download_delay, delay) - self.assertIn('proxy.zyte.com', dnscache) + self.assertIn("proxy.zyte.com", dnscache) mw.process_exception(req, ConnectionRefusedError(), self.spider) self.assertEqual(slot.delay, mw.connection_refused_delay) self.assertEqual(self.spider.download_delay, delay) - self.assertNotIn('proxy.zyte.com', dnscache) + self.assertNotIn("proxy.zyte.com", dnscache) - dnscache['proxy.zyte.com'] = '1.1.1.1' + dnscache["proxy.zyte.com"] = "1.1.1.1" res = self._mock_zyte_smartproxy_response(ban_url, status=self.bancode) mw.process_response(req, res, self.spider) self.assertEqual(slot.delay, delay) self.assertEqual(self.spider.download_delay, delay) - self.assertIn('proxy.zyte.com', dnscache) + self.assertIn("proxy.zyte.com", dnscache) mw.process_exception(req, ConnectionDone(), self.spider) self.assertEqual(slot.delay, mw.connection_refused_delay) self.assertEqual(self.spider.download_delay, delay) - self.assertNotIn('proxy.zyte.com', dnscache) + self.assertNotIn("proxy.zyte.com", dnscache) def test_process_exception_outside_zyte_smartproxy(self): self.spider.zyte_smartproxy_enabled = False @@ -378,33 +417,33 @@ def test_jobid_header(self): crawler = self._mock_crawler(self.spider, self.settings) mw1 = self.mwcls.from_crawler(crawler) mw1.open_spider(self.spider) - req1 = Request('http://example.com') + req1 = Request("http://example.com") self.assertEqual(mw1.process_request(req1, self.spider), None) - self.assertEqual(req1.headers.get('X-Crawlera-Jobid'), None) - self.assertEqual(req1.headers.get('Zyte-JobId'), None) + self.assertEqual(req1.headers.get("X-Crawlera-Jobid"), None) + self.assertEqual(req1.headers.get("Zyte-JobId"), None) # test with the 
environment variable 'SCRAPY_JOB'
-        os.environ['SCRAPY_JOB'] = '2816'
+        os.environ["SCRAPY_JOB"] = "2816"
         self.spider.zyte_smartproxy_enabled = True
         mw2 = self.mwcls.from_crawler(crawler)
         mw2.open_spider(self.spider)
-        req2 = Request('http://example.com')
+        req2 = Request("http://example.com")
         self.assertEqual(mw2.process_request(req2, self.spider), None)
-        self.assertEqual(req2.headers.get('X-Crawlera-Jobid'), b'2816')
-        self.assertEqual(req2.headers.get('Zyte-JobId'), None)
+        self.assertEqual(req2.headers.get("X-Crawlera-Jobid"), b"2816")
+        self.assertEqual(req2.headers.get("Zyte-JobId"), None)
 
         # Zyte API
         mw3 = self.mwcls.from_crawler(crawler)
         mw3.open_spider(self.spider)
         req3 = Request(
-            'http://example.com',
+            "http://example.com",
             meta={
                 "proxy": "http://apikey:@api.zyte.com:8011",
             },
         )
         self.assertEqual(mw3.process_request(req3, self.spider), None)
-        self.assertEqual(req3.headers.get('X-Crawlera-Jobid'), None)
-        self.assertEqual(req3.headers.get('Zyte-JobId'), b'2816')
+        self.assertEqual(req3.headers.get("X-Crawlera-Jobid"), None)
+        self.assertEqual(req3.headers.get("Zyte-JobId"), b"2816")
 
     def _test_stats(self, settings, prefix):
         self.spider.zyte_smartproxy_enabled = True
@@ -416,79 +455,101 @@ def _test_stats(self, settings, prefix):
         mw.open_spider(spider)
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
 
-        req = Request('http://example.com')
+        req = Request("http://example.com")
         assert mw.process_request(req, spider) is None
         assert httpproxy.process_request(req, spider) is None
-        self.assertEqual(crawler.stats.get_value('{}/request'.format(prefix)), 1)
-        self.assertEqual(crawler.stats.get_value('{}/request/method/GET'.format(prefix)), 1)
+        self.assertEqual(crawler.stats.get_value("{}/request".format(prefix)), 1)
+        self.assertEqual(
+            crawler.stats.get_value("{}/request/method/GET".format(prefix)), 1
+        )
 
         res = self._mock_zyte_smartproxy_response(req.url)
         assert mw.process_response(req, res, spider) is res
-        self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 1)
-        self.assertEqual(crawler.stats.get_value('{}/response/status/200'.format(prefix)), 1)
+        self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 1)
+        self.assertEqual(
+            crawler.stats.get_value("{}/response/status/200".format(prefix)), 1
+        )
 
-        req = Request('http://example.com/other', method='POST')
+        req = Request("http://example.com/other", method="POST")
         assert mw.process_request(req, spider) is None
         assert httpproxy.process_request(req, spider) is None
-        self.assertEqual(crawler.stats.get_value('{}/request'.format(prefix)), 2)
-        self.assertEqual(crawler.stats.get_value('{}/request/method/POST'.format(prefix)), 1)
+        self.assertEqual(crawler.stats.get_value("{}/request".format(prefix)), 2)
+        self.assertEqual(
+            crawler.stats.get_value("{}/request/method/POST".format(prefix)), 1
+        )
 
         res = self._mock_zyte_smartproxy_response(
-            req.url,
-            status=mw.ban_code,
-            headers={'Zyte-Error': 'somethingbad'}
+            req.url, status=mw.ban_code, headers={"Zyte-Error": "somethingbad"}
         )
         assert mw.process_response(req, res, spider) is res
-        self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 2)
-        self.assertEqual(crawler.stats.get_value('{}/response/status/{}'.format(prefix, mw.ban_code)), 1)
-        self.assertEqual(crawler.stats.get_value('{}/response/error'.format(prefix)), 1)
-        self.assertEqual(crawler.stats.get_value('{}/response/error/somethingbad'.format(prefix)), 1)
+        self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 2)
+        self.assertEqual(
+            crawler.stats.get_value(
+                "{}/response/status/{}".format(prefix, mw.ban_code)
+            ),
+            1,
+        )
+        self.assertEqual(crawler.stats.get_value("{}/response/error".format(prefix)), 1)
+        self.assertEqual(
+            crawler.stats.get_value("{}/response/error/somethingbad".format(prefix)), 1
+        )
         self.assertEqual(res.headers["X-Crawlera-Error"], b"somethingbad")
         self.assertEqual(res.headers["Zyte-Error"], b"somethingbad")
 
         res = self._mock_zyte_smartproxy_response(
             req.url,
             status=mw.ban_code,
-            headers={'X-Crawlera-Error': 'banned', "Retry-After": "1"}
+            headers={"X-Crawlera-Error": "banned", "Retry-After": "1"},
         )
         assert mw.process_response(req, res, spider) is res
-        self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 3)
-        self.assertEqual(crawler.stats.get_value('{}/response/status/{}'.format(prefix, mw.ban_code)), 2)
-        self.assertEqual(crawler.stats.get_value('{}/response/banned'.format(prefix)), 1)
+        self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 3)
+        self.assertEqual(
+            crawler.stats.get_value(
+                "{}/response/status/{}".format(prefix, mw.ban_code)
+            ),
+            2,
+        )
+        self.assertEqual(
+            crawler.stats.get_value("{}/response/banned".format(prefix)), 1
+        )
         self.assertEqual(res.headers["X-Crawlera-Error"], b"banned")
         self.assertEqual(res.headers["Zyte-Error"], b"banned")
 
         res = self._mock_zyte_smartproxy_response(
             req.url,
             status=mw.ban_code,
-            headers={'X-Crawlera-Error': 'banned', "Retry-After": "1"}
+            headers={"X-Crawlera-Error": "banned", "Retry-After": "1"},
         )
         slot_key = "example.com"
         crawler.engine.downloader.slots[slot_key] = MockedSlot()
         req.meta["download_slot"] = "example.com"
         assert mw.process_response(req, res, spider) is res
         del req.meta["download_slot"]
-        self.assertEqual(crawler.stats.get_value('{}/delay/banned'.format(prefix)), 1)
-        self.assertEqual(crawler.stats.get_value('{}/delay/banned/total'.format(prefix)), 1)
+        self.assertEqual(crawler.stats.get_value("{}/delay/banned".format(prefix)), 1)
+        self.assertEqual(
+            crawler.stats.get_value("{}/delay/banned/total".format(prefix)), 1
+        )
 
         res = self._mock_zyte_smartproxy_response(
             req.url,
             status=407,
-            headers={'X-Crawlera-Error': 'bad_proxy_auth'},
+            headers={"X-Crawlera-Error": "bad_proxy_auth"},
         )
         assert isinstance(mw.process_response(req, res, spider), Request)
-        self.assertEqual(crawler.stats.get_value('{}/retries/auth'.format(prefix)), 1)
+        self.assertEqual(crawler.stats.get_value("{}/retries/auth".format(prefix)), 1)
 
         res = self._mock_zyte_smartproxy_response(
             req.url,
             status=407,
-            headers={'X-Crawlera-Error': 'bad_proxy_auth'},
+            headers={"X-Crawlera-Error": "bad_proxy_auth"},
         )
         req.meta["zyte_smartproxy_auth_retry_times"] = 11
         assert mw.process_response(req, res, spider) is res
         del req.meta["zyte_smartproxy_auth_retry_times"]
-        self.assertEqual(crawler.stats.get_value('{}/retries/auth'.format(prefix)), 1)
-        self.assertEqual(crawler.stats.get_value('{}/retries/auth/max_reached'.format(prefix)), 1)
+        self.assertEqual(crawler.stats.get_value("{}/retries/auth".format(prefix)), 1)
+        self.assertEqual(
+            crawler.stats.get_value("{}/retries/auth/max_reached".format(prefix)), 1
+        )
 
         res = self._mock_zyte_smartproxy_response(
             req.url,
@@ -497,7 +558,12 @@ def _test_stats(self, settings, prefix):
         req.meta["dont_proxy"] = True
         assert isinstance(mw.process_response(req, res, spider), Request)
         del req.meta["dont_proxy"]
-        self.assertEqual(crawler.stats.get_value('{}/retries/should_have_been_enabled'.format(prefix)), 1)
+        self.assertEqual(
+            crawler.stats.get_value(
+                "{}/retries/should_have_been_enabled".format(prefix)
+            ),
+            1,
+        )
 
     def test_stats_spm(self):
         self._test_stats(self.settings, "zyte_smartproxy")
@@ -514,16 +580,16 @@ def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs):
         mw.open_spider(spider)
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
         headers = {
-            'X-Crawlera-Debug': True,
-            'X-Crawlera-Foo': "foo",
-            'X-Crawlera-Profile': 'desktop',
-            'User-Agent': 'Scrapy',
-            '': None,
-            'Zyte-Bar': "bar",
-            'Zyte-BrowserHtml': True,
-            'Zyte-Geolocation': 'foo',
+            "X-Crawlera-Debug": True,
+            "X-Crawlera-Foo": "foo",
+            "X-Crawlera-Profile": "desktop",
+            "User-Agent": "Scrapy",
+            "": None,
+            "Zyte-Bar": "bar",
+            "Zyte-BrowserHtml": True,
+            "Zyte-Geolocation": "foo",
         }
-        req = Request('http://example.com', headers=headers, **kwargs)
+        req = Request("http://example.com", headers=headers, **kwargs)
         mw.process_request(req, spider)
         httpproxy.process_request(req, spider)
         return req
@@ -531,92 +597,95 @@ def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs):
 
     def test_clean_headers_when_disabled(self):
         req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=False)
 
-        self.assertNotIn(b'X-Crawlera-Debug', req.headers)
-        self.assertNotIn(b'X-Crawlera-Foo', req.headers)
-        self.assertNotIn(b'X-Crawlera-Profile', req.headers)
-        self.assertNotIn(b'Zyte-Bar', req.headers)
-        self.assertNotIn(b'Zyte-BrowserHtml', req.headers)
-        self.assertNotIn(b'Zyte-Geolocation', req.headers)
-        self.assertIn(b'User-Agent', req.headers)
+        self.assertNotIn(b"X-Crawlera-Debug", req.headers)
+        self.assertNotIn(b"X-Crawlera-Foo", req.headers)
+        self.assertNotIn(b"X-Crawlera-Profile", req.headers)
+        self.assertNotIn(b"Zyte-Bar", req.headers)
+        self.assertNotIn(b"Zyte-BrowserHtml", req.headers)
+        self.assertNotIn(b"Zyte-Geolocation", req.headers)
+        self.assertIn(b"User-Agent", req.headers)
 
     def test_clean_headers_when_enabled_spm(self):
         req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=True)
-        self.assertEqual(req.headers[b'X-Crawlera-Debug'], b'True')
-        self.assertEqual(req.headers[b'X-Crawlera-Foo'], b'foo')
-        self.assertEqual(req.headers[b'X-Crawlera-Profile'], b'desktop')
-        self.assertNotIn(b'Zyte-Bar', req.headers)
-        self.assertNotIn(b'Zyte-BrowserHtml', req.headers)
-        self.assertNotIn(b'Zyte-Geolocation', req.headers)
-        self.assertEqual(req.headers[b'X-Crawlera-Region'], b'foo')
-        self.assertIn(b'User-Agent', req.headers)
+        self.assertEqual(req.headers[b"X-Crawlera-Debug"], b"True")
+        self.assertEqual(req.headers[b"X-Crawlera-Foo"], b"foo")
+        self.assertEqual(req.headers[b"X-Crawlera-Profile"], b"desktop")
+        self.assertNotIn(b"Zyte-Bar", req.headers)
+        self.assertNotIn(b"Zyte-BrowserHtml", req.headers)
+        self.assertNotIn(b"Zyte-Geolocation", req.headers)
+        self.assertEqual(req.headers[b"X-Crawlera-Region"], b"foo")
+        self.assertIn(b"User-Agent", req.headers)
 
     def test_clean_headers_when_enabled_zyte_api(self):
         meta = {"proxy": "http://apikey:@api.zyte.com:8011"}
-        req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=True, meta=meta)
-        self.assertNotIn(b'X-Crawlera-Debug', req.headers)
-        self.assertNotIn(b'X-Crawlera-Foo', req.headers)
-        self.assertNotIn(b'X-Crawlera-Profile', req.headers)
-        self.assertEqual(req.headers[b'Zyte-Bar'], b'bar')
-        self.assertEqual(req.headers[b'Zyte-BrowserHtml'], b'True')
-        self.assertEqual(req.headers[b'Zyte-Device'], b'desktop')
-        self.assertEqual(req.headers[b'Zyte-Geolocation'], b'foo')
-        self.assertIn(b'User-Agent', req.headers)
+        req = self._make_fake_request(
+            self.spider, zyte_smartproxy_enabled=True, meta=meta
+        )
+        self.assertNotIn(b"X-Crawlera-Debug", req.headers)
+        self.assertNotIn(b"X-Crawlera-Foo", req.headers)
+        self.assertNotIn(b"X-Crawlera-Profile", req.headers)
+        self.assertEqual(req.headers[b"Zyte-Bar"], b"bar")
+        self.assertEqual(req.headers[b"Zyte-BrowserHtml"], b"True")
+        self.assertEqual(req.headers[b"Zyte-Device"], b"desktop")
+        self.assertEqual(req.headers[b"Zyte-Geolocation"], b"foo")
+        self.assertIn(b"User-Agent", req.headers)
 
     def test_zyte_smartproxy_default_headers(self):
         spider = self.spider
         self.spider.zyte_smartproxy_enabled = True
 
-        self.settings['ZYTE_SMARTPROXY_DEFAULT_HEADERS'] = {
-            'X-Crawlera-Profile': 'desktop',
+        self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = {
+            "X-Crawlera-Profile": "desktop",
         }
         crawler = self._mock_crawler(spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(spider)
-        req = Request('http://example.com/other')
+        req = Request("http://example.com/other")
         assert mw.process_request(req, spider) is None
-        self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop')
-        self.assertNotIn('Zyte-Device', req.headers)
+        self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop")
+        self.assertNotIn("Zyte-Device", req.headers)
 
         # Header translation
         req = Request(
-            'http://example.com/other',
+            "http://example.com/other",
             meta={"proxy": "http://apikey:@api.zyte.com:8011"},
         )
         assert mw.process_request(req, spider) is None
-        self.assertNotIn('X-Crawlera-Profile', req.headers)
-        self.assertEqual(req.headers['Zyte-Device'], b'desktop')
+        self.assertNotIn("X-Crawlera-Profile", req.headers)
+        self.assertEqual(req.headers["Zyte-Device"], b"desktop")
 
         # test ignore None headers
-        self.settings['ZYTE_SMARTPROXY_DEFAULT_HEADERS'] = {
-            'X-Crawlera-Profile': None,
-            'X-Crawlera-Cookies': 'disable',
+        self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = {
+            "X-Crawlera-Profile": None,
+            "X-Crawlera-Cookies": "disable",
         }
         crawler = self._mock_crawler(spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(spider)
-        req = Request('http://example.com/other')
+        req = Request("http://example.com/other")
         assert mw.process_request(req, spider) is None
-        self.assertEqual(req.headers['X-Crawlera-Cookies'], b'disable')
-        self.assertNotIn('X-Crawlera-Profile', req.headers)
-
-    @patch('scrapy_zyte_smartproxy.middleware.warnings')
-    @patch('scrapy_zyte_smartproxy.middleware.logger')
-    def test_zyte_smartproxy_default_headers_conflicting_headers(self, mock_logger, mock_warnings):
+        self.assertEqual(req.headers["X-Crawlera-Cookies"], b"disable")
+        self.assertNotIn("X-Crawlera-Profile", req.headers)
+
+    @patch("scrapy_zyte_smartproxy.middleware.warnings")
+    @patch("scrapy_zyte_smartproxy.middleware.logger")
+    def test_zyte_smartproxy_default_headers_conflicting_headers(
+        self, mock_logger, mock_warnings
+    ):
         spider = self.spider
         self.spider.zyte_smartproxy_enabled = True
 
-        self.settings['ZYTE_SMARTPROXY_DEFAULT_HEADERS'] = {
-            'X-Crawlera-Profile': 'desktop',
+        self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = {
+            "X-Crawlera-Profile": "desktop",
        }
         crawler = self._mock_crawler(spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(spider)
 
-        req = Request('http://example.com/other',
-                      headers={'X-Crawlera-UA': 'desktop'})
+        req = Request("http://example.com/other", headers={"X-Crawlera-UA": "desktop"})
         assert mw.process_request(req, spider) is None
-        self.assertEqual(req.headers['X-Crawlera-UA'], b'desktop')
-        self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop')
+        self.assertEqual(req.headers["X-Crawlera-UA"], b"desktop")
+        self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop")
         some_requests_warning = (
             "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are "
             "conflicting on some of your requests. Please check "
@@ -633,20 +702,17 @@ def test_zyte_smartproxy_default_headers_conflicting_headers(self, mock_logger,
             "for more information"
         )
         mock_logger.debug.assert_called_with(
-            other_request_warning,
-            extra={'spider': spider}
+            other_request_warning, extra={"spider": spider}
         )
 
         # test it ignores case
-        req = Request('http://example.com/other',
-                      headers={'x-crawlera-ua': 'desktop'})
+        req = Request("http://example.com/other", headers={"x-crawlera-ua": "desktop"})
         assert mw.process_request(req, spider) is None
-        self.assertEqual(req.headers['X-Crawlera-UA'], b'desktop')
-        self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop')
+        self.assertEqual(req.headers["X-Crawlera-UA"], b"desktop")
+        self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop")
         mock_warnings.warn.assert_called_with(some_requests_warning)
         mock_logger.debug.assert_called_with(
-            other_request_warning,
-            extra={'spider': spider}
+            other_request_warning, extra={"spider": spider}
         )
 
     def test_dont_proxy_false_does_nothing(self):
@@ -655,10 +721,10 @@ def test_dont_proxy_false_does_nothing(self):
         crawler = self._mock_crawler(spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(spider)
-        req = Request('http://example.com/other')
-        req.meta['dont_proxy'] = False
+        req = Request("http://example.com/other")
+        req.meta["dont_proxy"] = False
         assert mw.process_request(req, spider) is None
-        self.assertIsNotNone(req.meta.get('proxy'))
+        self.assertIsNotNone(req.meta.get("proxy"))
 
     def test_is_banned(self):
         self.spider.zyte_smartproxy_enabled = True
@@ -670,37 +736,47 @@ def test_is_banned(self):
         res = Response(req.url, status=200)
         res = mw.process_response(req, res, self.spider)
         self.assertFalse(mw._is_banned(res))
-        res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'noslaves'})
+        res = Response(req.url, status=503, headers={"X-Crawlera-Error": "noslaves"})
         res = mw.process_response(req, res, self.spider)
         self.assertFalse(mw._is_banned(res))
-        res = Response(req.url, status=503, headers={'Zyte-Error': '/limits/over-global-limit'})
+        res = Response(
+            req.url,
+            status=503,
+            headers={"Zyte-Error": "/limits/over-global-limit"},
+        )
         res = mw.process_response(req, res, self.spider)
         self.assertFalse(mw._is_banned(res))
-        res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'banned'})
+        res = Response(req.url, status=503, headers={"X-Crawlera-Error": "banned"})
         res = mw.process_response(req, res, self.spider)
         self.assertTrue(mw._is_banned(res))
-        res = Response(req.url, status=520, headers={'Zyte-Error': '/download/temporary-error'})
+        res = Response(
+            req.url, status=520, headers={"Zyte-Error": "/download/temporary-error"}
+        )
         res = mw.process_response(req, res, self.spider)
         self.assertTrue(mw._is_banned(res))
-        res = Response(req.url, status=521, headers={'Zyte-Error': '/download/internal-error'})
+        res = Response(
+            req.url,
+            status=521,
+            headers={"Zyte-Error": "/download/internal-error"},
+        )
         res = mw.process_response(req, res, self.spider)
         self.assertTrue(mw._is_banned(res))
 
-    @patch('random.uniform')
+    @patch("random.uniform")
     def test_noslaves_delays(self, random_uniform_patch):
         # mock random.uniform to just return the max delay
         random_uniform_patch.side_effect = lambda x, y: y
 
-        slot_key = 'example.com'
-        url = 'http://example.com'
-        ban_url = 'http://banned.example'
+        slot_key = "example.com"
+        url = "http://example.com"
+        ban_url = "http://banned.example"
         max_delay = 70
         backoff_step = 15
         default_delay = 0
 
-        self.settings['ZYTE_SMARTPROXY_BACKOFF_STEP'] = backoff_step
-        self.settings['ZYTE_SMARTPROXY_BACKOFF_MAX'] = max_delay
+        self.settings["ZYTE_SMARTPROXY_BACKOFF_STEP"] = backoff_step
+        self.settings["ZYTE_SMARTPROXY_BACKOFF_MAX"] = max_delay
 
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
@@ -711,7 +787,7 @@ def test_noslaves_delays(self, random_uniform_patch):
         slot = MockedSlot()
         crawler.engine.downloader.slots[slot_key] = slot
 
-        noslaves_req = Request(url, meta={'download_slot': slot_key})
+        noslaves_req = Request(url, meta={"download_slot": slot_key})
         assert mw.process_request(noslaves_req, self.spider) is None
         assert httpproxy.process_request(noslaves_req, self.spider) is None
 
@@ -719,7 +795,7 @@ def test_noslaves_delays(self, random_uniform_patch):
         noslaves_response = self._mock_zyte_smartproxy_response(
             ban_url,
             status=503,
-            headers={'X-Crawlera-Error': 'noslaves'},
+            headers={"X-Crawlera-Error": "noslaves"},
         )
         mw.process_response(noslaves_req, noslaves_response, self.spider)
         self.assertEqual(slot.delay, backoff_step)
@@ -727,32 +803,32 @@ def test_noslaves_delays(self, random_uniform_patch):
         over_use_limit_response = self._mock_zyte_smartproxy_response(
             ban_url,
             status=429,
-            headers={'Zyte-Error': '/limits/over-user-limit'},
+            headers={"Zyte-Error": "/limits/over-user-limit"},
         )
         mw.process_response(noslaves_req, over_use_limit_response, self.spider)
-        self.assertEqual(slot.delay, backoff_step * 2 ** 1)
+        self.assertEqual(slot.delay, backoff_step * 2**1)
 
         over_domain_limit_response = self._mock_zyte_smartproxy_response(
             ban_url,
             status=429,
-            headers={'Zyte-Error': '/limits/over-domain-limit'},
+            headers={"Zyte-Error": "/limits/over-domain-limit"},
         )
         mw.process_response(noslaves_req, over_domain_limit_response, self.spider)
-        self.assertEqual(slot.delay, backoff_step * 2 ** 2)
+        self.assertEqual(slot.delay, backoff_step * 2**2)
 
         over_global_limit_response = self._mock_zyte_smartproxy_response(
             ban_url,
             status=503,
-            headers={'Zyte-Error': '/limits/over-global-limit'},
+            headers={"Zyte-Error": "/limits/over-global-limit"},
         )
         mw.process_response(noslaves_req, over_global_limit_response, self.spider)
         self.assertEqual(slot.delay, max_delay)
 
         # other responses reset delay
-        ban_req = Request(url, meta={'download_slot': slot_key})
+        ban_req = Request(url, meta={"download_slot": slot_key})
         assert mw.process_request(ban_req, self.spider) is None
         assert httpproxy.process_request(ban_req, self.spider) is None
-        ban_headers = {'X-Crawlera-Error': 'banned'}
+        ban_headers = {"X-Crawlera-Error": "banned"}
         ban_res = self._mock_zyte_smartproxy_response(
             ban_url,
             status=self.bancode,
@@ -764,7 +840,7 @@ def test_noslaves_delays(self, random_uniform_patch):
         mw.process_response(noslaves_req, noslaves_response, self.spider)
         self.assertEqual(slot.delay, backoff_step)
 
-        good_req = Request(url, meta={'download_slot': slot_key})
+        good_req = Request(url, meta={"download_slot": slot_key})
         assert mw.process_request(good_req, self.spider) is None
         assert httpproxy.process_request(good_req, self.spider) is None
         good_res = self._mock_zyte_smartproxy_response(
@@ -774,20 +850,19 @@ def test_noslaves_delays(self, random_uniform_patch):
         mw.process_response(good_req, good_res, self.spider)
         self.assertEqual(slot.delay, default_delay)
 
-    @patch('random.uniform')
+    @patch("random.uniform")
     def test_auth_error_retries(self, random_uniform_patch):
         # mock random.uniform to just return the max delay
         random_uniform_patch.side_effect = lambda x, y: y
 
-        slot_key = 'example.com'
-        url = 'http://example.com'
-        ban_url = 'http://auth.error'
+        slot_key = "example.com"
+        url = "http://example.com"
+        ban_url = "http://auth.error"
         max_delay = 70
         backoff_step = 15
-        default_delay = 0
 
-        self.settings['ZYTE_SMARTPROXY_BACKOFF_STEP'] = backoff_step
-        self.settings['ZYTE_SMARTPROXY_BACKOFF_MAX'] = max_delay
+        self.settings["ZYTE_SMARTPROXY_BACKOFF_STEP"] = backoff_step
+        self.settings["ZYTE_SMARTPROXY_BACKOFF_MAX"] = max_delay
 
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
@@ -799,14 +874,12 @@ def test_auth_error_retries(self, random_uniform_patch):
         slot = MockedSlot()
         crawler.engine.downloader.slots[slot_key] = slot
 
-        auth_error_req = Request(url, meta={'download_slot': slot_key})
+        auth_error_req = Request(url, meta={"download_slot": slot_key})
         assert mw.process_request(auth_error_req, self.spider) is None
         assert httpproxy.process_request(auth_error_req, self.spider) is None
-        auth_error_headers = {'X-Crawlera-Error': 'bad_proxy_auth'}
+        auth_error_headers = {"X-Crawlera-Error": "bad_proxy_auth"}
         auth_error_response = self._mock_zyte_smartproxy_response(
-            ban_url,
-            status=self.auth_error_code,
-            headers=auth_error_headers
+            ban_url, status=self.auth_error_code, headers=auth_error_headers
         )
 
         # delays grow exponentially, retry times increase accordingly
@@ -817,13 +890,13 @@ def test_auth_error_retries(self, random_uniform_patch):
 
         auth_error_req.meta["zyte_smartproxy_auth_retry_times"] = retry_times
         req = mw.process_response(auth_error_req, auth_error_response, self.spider)
-        self.assertEqual(slot.delay, backoff_step * 2 ** 1)
+        self.assertEqual(slot.delay, backoff_step * 2**1)
         retry_times = req.meta["zyte_smartproxy_auth_retry_times"]
         self.assertEqual(retry_times, 2)
 
         auth_error_req.meta["zyte_smartproxy_auth_retry_times"] = retry_times
         req = mw.process_response(auth_error_req, auth_error_response, self.spider)
-        self.assertEqual(slot.delay, backoff_step * 2 ** 2)
+        self.assertEqual(slot.delay, backoff_step * 2**2)
         retry_times = req.meta["zyte_smartproxy_auth_retry_times"]
         self.assertEqual(retry_times, 3)
 
@@ -844,10 +917,12 @@ def test_auth_error_retries(self, random_uniform_patch):
             ban_url,
             status=self.auth_error_code,
         )
-        res = mw.process_response(auth_error_req, non_zyte_smartproxy_407_response, self.spider)
+        res = mw.process_response(
+            auth_error_req, non_zyte_smartproxy_407_response, self.spider
+        )
         self.assertIsInstance(res, Response)
 
-    @patch('scrapy_zyte_smartproxy.middleware.logger')
+    @patch("scrapy_zyte_smartproxy.middleware.logger")
     def test_open_spider_logging(self, mock_logger):
         spider = self.spider
         self.spider.zyte_smartproxy_enabled = True
@@ -856,10 +931,9 @@ def test_open_spider_logging(self, mock_logger):
         mw.open_spider(spider)
         expected_calls = [
             call(
-                "Using Zyte proxy service %s with an API key ending in %s" % (
-                    self.mwcls.url, 'apikey'
-                ),
-                extra={'spider': spider},
+                "Using Zyte proxy service %s with an API key ending in %s"
+                % (self.mwcls.url, "apikey"),
+                extra={"spider": spider},
             ),
             call(
                 "ZyteSmartProxyMiddleware: disabling download delays in "
@@ -867,7 +941,7 @@ def test_open_spider_logging(self, mock_logger):
                 "To avoid this behaviour you can use the "
                 "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind "
                 "that this may slow down the crawl significantly",
-                extra={'spider': spider},
+                extra={"spider": spider},
             ),
         ]
         assert mock_logger.info.call_args_list == expected_calls
@@ -876,7 +950,7 @@ def test_process_response_enables_zyte_smartproxy(self):
         url = "https://scrapy.org"
 
         self.spider.zyte_smartproxy_enabled = False
-        self.settings['ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES'] = [403]
+        self.settings["ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES"] = [403]
         crawler = self._mock_crawler(self.spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(self.spider)
@@ -898,9 +972,12 @@ def test_process_response_enables_zyte_smartproxy(self):
         self.assertIsInstance(out, Request)
         self.assertEqual(mw.enabled, False)
         self.assertEqual(mw.enabled_for_domain["scrapy.org"], True)
-        self.assertEqual(mw.crawler.stats.get_stats(), {
-            'zyte_smartproxy/retries/should_have_been_enabled': 1,
-        })
+        self.assertEqual(
+            mw.crawler.stats.get_stats(),
+            {
+                "zyte_smartproxy/retries/should_have_been_enabled": 1,
+            },
+        )
 
         # Another regular response with bad code should be done on Zyte Smart
         # Proxy Manager and not be retried
@@ -926,7 +1003,7 @@ def test_process_response_from_file_scheme(self):
         url = "file:///tmp/foobar.txt"
 
         self.spider.zyte_smartproxy_enabled = False
-        self.settings['ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES'] = [403]
+        self.settings["ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES"] = [403]
         crawler = self._mock_crawler(self.spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.enabled_for_domain = {}
@@ -943,7 +1020,7 @@ def test_process_response_from_file_scheme(self):
         self.assertEqual(mw.crawler.stats.get_stats(), {})
         self.assertEqual(out.status, 200)
 
-    @patch('scrapy_zyte_smartproxy.middleware.logger')
+    @patch("scrapy_zyte_smartproxy.middleware.logger")
     def test_apikey_warning_zyte_smartproxy_disabled(self, mock_logger):
         self.spider.zyte_smartproxy_enabled = False
         settings = {}
@@ -953,7 +1030,7 @@ def test_apikey_warning_zyte_smartproxy_disabled(self, mock_logger):
         self.assertFalse(mw.enabled)
         mock_logger.warning.assert_not_called()
 
-    @patch('scrapy_zyte_smartproxy.middleware.logger')
+    @patch("scrapy_zyte_smartproxy.middleware.logger")
     def test_no_apikey_warning_zyte_smartproxy_enabled(self, mock_logger):
         self.spider.zyte_smartproxy_enabled = True
         settings = {}
@@ -963,28 +1040,28 @@ def test_no_apikey_warning_zyte_smartproxy_enabled(self, mock_logger):
         self.assertTrue(mw.enabled)
         mock_logger.warning.assert_called_with(
             "Zyte proxy services cannot be used without an API key",
-            extra={'spider': self.spider}
+            extra={"spider": self.spider},
         )
 
-    @patch('scrapy_zyte_smartproxy.middleware.logger')
+    @patch("scrapy_zyte_smartproxy.middleware.logger")
    def test_no_apikey_warning_force_enable(self, mock_logger):
         self.spider.zyte_smartproxy_enabled = False
-        settings = {'ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES': [403]}
+        settings = {"ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES": [403]}
         crawler = self._mock_crawler(self.spider, settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(self.spider)
         self.assertFalse(mw.enabled)
         mock_logger.warning.assert_called_with(
             "Zyte proxy services cannot be used without an API key",
-            extra={'spider': self.spider}
+            extra={"spider": self.spider},
         )
 
-    @patch('scrapy_zyte_smartproxy.middleware.logger')
+    @patch("scrapy_zyte_smartproxy.middleware.logger")
     def test_apikey_warning_force_enable(self, mock_logger):
         self.spider.zyte_smartproxy_enabled = False
         settings = {
-            'ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES': [403],
-            'ZYTE_SMARTPROXY_APIKEY': 'apikey'
+            "ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES": [403],
+            "ZYTE_SMARTPROXY_APIKEY": "apikey",
         }
         crawler = self._mock_crawler(self.spider, settings)
         mw = self.mwcls.from_crawler(crawler)
@@ -992,24 +1069,20 @@ def test_apikey_warning_force_enable(self, mock_logger):
         self.assertFalse(mw.enabled)
         mock_logger.warning.assert_not_called()
 
-
     def test_is_enabled_warnings(self):
         self._assert_disabled(self.spider, self.settings)
-        self.settings['HUBPROXY_ENABLED'] = True
+        self.settings["HUBPROXY_ENABLED"] = True
         with pytest.warns(ScrapyDeprecationWarning) as record:
             self._assert_enabled(self.spider, self.settings)
             assert len(record) == 1
-            assert 'HUBPROXY_ENABLED setting is deprecated' in \
-                str(record[0].message)
+            assert "HUBPROXY_ENABLED setting is deprecated" in str(record[0].message)
 
-        del self.settings['HUBPROXY_ENABLED']
+        del self.settings["HUBPROXY_ENABLED"]
         self.spider.use_hubproxy = False
         with pytest.warns(ScrapyDeprecationWarning) as record:
             self._assert_disabled(self.spider, self.settings)
             assert len(record) == 1
-            assert 'use_hubproxy attribute is deprecated' in \
-                str(record[0].message)
-
+            assert "use_hubproxy attribute is deprecated" in str(record[0].message)
 
     def test_settings_warnings(self):
         self.spider.hubproxy_maxbans = 10
@@ -1018,23 +1091,22 @@ def test_settings_warnings(self):
         with pytest.warns(ScrapyDeprecationWarning) as record:
             mw.open_spider(self.spider)
             assert len(record) == 1
-            assert 'hubproxy_maxbans attribute is deprecated' in \
-                str(record[0].message)
+            assert "hubproxy_maxbans attribute is deprecated" in str(record[0].message)
         del self.spider.hubproxy_maxbans
 
-        self.settings['HUBPROXY_BACKOFF_MAX'] = 10
+        self.settings["HUBPROXY_BACKOFF_MAX"] = 10
         crawler = self._mock_crawler(self.spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         with pytest.warns(ScrapyDeprecationWarning) as record:
             mw.open_spider(self.spider)
             assert len(record) == 1
-            assert 'HUBPROXY_BACKOFF_MAX setting is deprecated' in \
-                str(record[0].message)
-
+            assert "HUBPROXY_BACKOFF_MAX setting is deprecated" in str(
+                record[0].message
+            )
 
     def test_no_slot(self):
-        url = 'http://example.com'
-        ban_url = 'http://banned.example'
+        url = "http://example.com"
+        ban_url = "http://banned.example"
 
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
@@ -1042,11 +1114,10 @@ def test_no_slot(self):
         mw.open_spider(self.spider)
 
         # there are no slot named 'example.com'
-        noslaves_req = Request(url,
-                               meta={'download_slot': 'example.com'})
+        noslaves_req = Request(url, meta={"download_slot": "example.com"})
         assert mw.process_request(noslaves_req, self.spider) is None
 
-        headers = {'X-Crawlera-Error': 'noslaves'}
+        headers = {"X-Crawlera-Error": "noslaves"}
         noslaves_res = self._mock_zyte_smartproxy_response(
             ban_url,
             status=self.bancode,
@@ -1056,45 +1127,42 @@ def test_no_slot(self):
         response = mw.process_response(noslaves_req, noslaves_res, self.spider)
         assert response.status == 503
 
-
     def test_settings_dict(self):
         self.spider.zyte_smartproxy_enabled = True
-        self.settings['ZYTE_SMARTPROXY_DEFAULT_HEADERS'] = {
-            'X-Crawlera-Profile': 'desktop',
+        self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = {
+            "X-Crawlera-Profile": "desktop",
         }
         crawler = self._mock_crawler(self.spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         # we don't have a dict settings yet, have to mess with protected
         # property
-        mw._settings.append(
-            ('default_headers', dict)
-        )
+        mw._settings.append(("default_headers", dict))
         mw.open_spider(self.spider)
-        req = Request('http://example.com/other')
+        req = Request("http://example.com/other")
         mw.process_request(req, self.spider)
         assert mw.process_request(req, self.spider) is None
-        self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop')
+        self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop")
 
     def test_client_header(self):
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(self.spider)
-        req1 = Request('http://example.com')
+        req1 = Request("http://example.com")
         self.assertEqual(mw.process_request(req1, self.spider), None)
-        client = 'scrapy-zyte-smartproxy/{}'.format(__version__).encode()
-        self.assertEqual(req1.headers.get('X-Crawlera-Client'), client)
-        self.assertEqual(req1.headers.get('Zyte-Client'), None)
+        client = "scrapy-zyte-smartproxy/{}".format(__version__).encode()
+        self.assertEqual(req1.headers.get("X-Crawlera-Client"), client)
+        self.assertEqual(req1.headers.get("Zyte-Client"), None)
 
         req2 = Request(
-            'http://example.com',
+            "http://example.com",
             meta={
                 "proxy": "http://apikey:@api.zyte.com:8011",
             },
         )
         self.assertEqual(mw.process_request(req2, self.spider), None)
-        self.assertEqual(req2.headers.get('X-Crawlera-Client'), None)
-        self.assertEqual(req2.headers.get('Zyte-Client'), client)
+        self.assertEqual(req2.headers.get("X-Crawlera-Client"), None)
+        self.assertEqual(req2.headers.get("Zyte-Client"), client)
 
     def test_scrapy_httpproxy_integration(self):
         self.spider.zyte_smartproxy_enabled = True
@@ -1102,26 +1170,26 @@ def test_scrapy_httpproxy_integration(self):
         smartproxy = self.mwcls.from_crawler(crawler)
         smartproxy.open_spider(self.spider)
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
-        request = Request('https://example.com')
-        auth_header = basic_auth_header('apikey', '')
+        request = Request("https://example.com")
+        auth_header = basic_auth_header("apikey", "")
 
         # 1st pass
         self.assertEqual(smartproxy.process_request(request, self.spider), None)
         self.assertEqual(httpproxy.process_request(request, self.spider), None)
-        self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011')
-        self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header)
+        self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011")
+        self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header)
 
         # 2nd pass (e.g. retry or redirect)
         self.assertEqual(smartproxy.process_request(request, self.spider), None)
         self.assertEqual(httpproxy.process_request(request, self.spider), None)
-        self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011')
-        self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header)
+        self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011")
+        self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header)
 
     def test_subclass_non_basic_header(self):
 
         class Subclass(self.mwcls):
             def get_proxyauth(self, spider):
-                return b'Non-Basic foo'
+                return b"Non-Basic foo"
 
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
@@ -1133,7 +1201,7 @@ def test_subclass_basic_header_non_base64(self):
 
         class Subclass(self.mwcls):
             def get_proxyauth(self, spider):
-                return b'Basic foo'
+                return b"Basic foo"
 
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
@@ -1145,7 +1213,7 @@ def test_subclass_basic_header_nonurlsafe_base64(self):
 
         class Subclass(self.mwcls):
             def get_proxyauth(self, spider):
-                return b'Basic YWF+Og=='
+                return b"Basic YWF+Og=="
 
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
@@ -1157,7 +1225,7 @@ def test_subclass_basic_header_urlsafe_base64(self):
 
         class Subclass(self.mwcls):
             def get_proxyauth(self, spider):
-                return b'Basic YWF-Og=='
+                return b"Basic YWF-Og=="
 
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
@@ -1187,7 +1255,9 @@ def test_header_translation(self):
             self.assertNotIn(header, request.headers)
             self.assertEqual(request.headers[translation], value)
 
-        spm_to_zyte_api_translations = {v: k for k, v in zyte_api_to_spm_translations.items()}
+        spm_to_zyte_api_translations = {
+            v: k for k, v in zyte_api_to_spm_translations.items()
+        }
         for header, translation in spm_to_zyte_api_translations.items():
             request = Request(
                 "https://example.com",
@@ -1198,7 +1268,7 @@ def test_header_translation(self):
             self.assertNotIn(header, request.headers)
             self.assertEqual(request.headers[translation], value)
 
-    @patch('scrapy_zyte_smartproxy.middleware.logger')
+    @patch("scrapy_zyte_smartproxy.middleware.logger")
     def test_header_drop_warnings(self, mock_logger):
         self.spider.zyte_smartproxy_enabled = True
         crawler = self._mock_crawler(self.spider, self.settings)
@@ -1245,7 +1315,8 @@ def test_header_drop_warnings(self, mock_logger):
                 "request is proxied with %s and not with %s, and "
                 "automatic translation is not supported for this "
                 "header. See "
-                "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping"
+                "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html"
+                "#parameter-mapping"
                 " to learn the right way to translate this header "
                 "manually."
             ),
@@ -1269,7 +1340,8 @@ def test_header_drop_warnings(self, mock_logger):
                 "request is proxied with %s and not with %s, and "
                 "automatic translation is not supported for this "
                 "header. See "
-                "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping"
+                "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html"
+                "#parameter-mapping"
                 " to learn the right way to translate this header "
                 "manually."
             ),
@@ -1289,7 +1361,8 @@ def test_header_drop_warnings(self, mock_logger):
             headers={"Zyte-Foo": "bar", "X-Crawlera-Foo": "bar"},
         )
         self.assertEqual(mw.process_request(request, self.spider), None)
-        mock_logger.warning.assert_not_called()  # No warnings for "drop all" scenarios
+        # No warnings for "drop all" scenarios
+        mock_logger.warning.assert_not_called()
 
     def test_header_based_handling(self):
         self.spider.zyte_smartproxy_enabled = True
@@ -1299,21 +1372,20 @@ def test_header_based_handling(self):
         mw.open_spider(spider)
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
 
-        req = Request('http://example.com')
+        req = Request("http://example.com")
         assert mw.process_request(req, spider) is None
         assert httpproxy.process_request(req, spider) is None
         count = 0
 
         res = Response(req.url)
         assert mw.process_response(req, res, spider) is res
-        self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), None)
+        self.assertEqual(crawler.stats.get_value("zyte_smartproxy/response"), None)
 
         for k, v in RESPONSE_IDENTIFYING_HEADERS:
             count += 1
             res = Response(req.url, headers={k: v})
             assert mw.process_response(req, res, spider) is res
-            self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), count)
-
+            self.assertEqual(crawler.stats.get_value("zyte_smartproxy/response"), count)
 
     def test_meta_copy(self):
         """Warn when users copy the proxy key from one response to the next."""
@@ -1322,20 +1394,20 @@ def test_meta_copy(self):
         smartproxy = self.mwcls.from_crawler(crawler)
         smartproxy.open_spider(self.spider)
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
-        auth_header = basic_auth_header('apikey', '')
+        auth_header = basic_auth_header("apikey", "")
 
-        request1 = Request('https://example.com/a')
+        request1 = Request("https://example.com/a")
         self.assertEqual(smartproxy.process_request(request1, self.spider), None)
         self.assertEqual(httpproxy.process_request(request1, self.spider), None)
-        self.assertEqual(request1.meta['proxy'], 'http://proxy.zyte.com:8011')
-        self.assertEqual(request1.headers[b'Proxy-Authorization'], auth_header)
+        self.assertEqual(request1.meta["proxy"], "http://proxy.zyte.com:8011")
+        self.assertEqual(request1.headers[b"Proxy-Authorization"], auth_header)
 
-        request2 = Request('https://example.com/b', meta=dict(request1.meta))
-        with patch('scrapy_zyte_smartproxy.middleware.logger') as logger:
+        request2 = Request("https://example.com/b", meta=dict(request1.meta))
+        with patch("scrapy_zyte_smartproxy.middleware.logger") as logger:
             self.assertEqual(smartproxy.process_request(request2, self.spider), None)
             self.assertEqual(httpproxy.process_request(request2, self.spider), None)
-        self.assertEqual(request2.meta['proxy'], 'http://proxy.zyte.com:8011')
-        self.assertEqual(request2.headers[b'Proxy-Authorization'], auth_header)
+        self.assertEqual(request2.meta["proxy"], "http://proxy.zyte.com:8011")
+        self.assertEqual(request2.headers[b"Proxy-Authorization"], auth_header)
         expected_calls = [
             call(
                 "The value of the 'proxy' meta key of request {request2} "
@@ -1356,14 +1428,14 @@ def test_manual_proxy_same(self):
         smartproxy = self.mwcls.from_crawler(crawler)
         smartproxy.open_spider(self.spider)
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
-        auth_header = basic_auth_header('apikey', '')
+        auth_header = basic_auth_header("apikey", "")
 
-        meta = {'proxy': 'http://apikey:@proxy.zyte.com:8011'}
-        request = Request('https://example.com', meta=meta)
+        meta = {"proxy": "http://apikey:@proxy.zyte.com:8011"}
+        request = Request("https://example.com", meta=meta)
         self.assertEqual(smartproxy.process_request(request, self.spider), None)
         self.assertEqual(httpproxy.process_request(request, self.spider), None)
-        self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011')
-        self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header)
+        self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011")
+        self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header)
 
     def test_manual_proxy_without_api_key(self):
         """Defining the 'proxy' request meta key with the right URL but missing
@@ -1373,15 +1445,15 @@ def test_manual_proxy_without_api_key(self):
         smartproxy = self.mwcls.from_crawler(crawler)
         smartproxy.open_spider(self.spider)
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
-        auth_header = basic_auth_header('apikey', '')
+        auth_header = basic_auth_header("apikey", "")
 
-        meta = {'proxy': 'http://proxy.zyte.com:8011'}
-        request = Request('https://example.com', meta=meta)
-        with patch('scrapy_zyte_smartproxy.middleware.logger') as logger:
+        meta = {"proxy": "http://proxy.zyte.com:8011"}
+        request = Request("https://example.com", meta=meta)
+        with patch("scrapy_zyte_smartproxy.middleware.logger") as logger:
            self.assertEqual(smartproxy.process_request(request, self.spider), None)
            self.assertEqual(httpproxy.process_request(request, self.spider), None)
-        self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011')
-        self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header)
+        self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011")
+        self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header)
         expected_calls = [
             call(
                 "The value of the 'proxy' meta key of request {request} "
@@ -1403,12 +1475,12 @@ def test_manual_proxy_different(self):
         smartproxy.open_spider(self.spider)
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
 
-        meta = {'proxy': 'http://proxy.example.com:8011'}
-        request = Request('https://example.com', meta=meta)
+        meta = {"proxy": "http://proxy.example.com:8011"}
+        request = Request("https://example.com", meta=meta)
         self.assertEqual(smartproxy.process_request(request, self.spider), None)
         self.assertEqual(httpproxy.process_request(request, self.spider), None)
-        self.assertEqual(request.meta['proxy'], 'http://proxy.example.com:8011')
-        self.assertNotIn(b'Proxy-Authorization', request.headers)
+        self.assertEqual(request.meta["proxy"], "http://proxy.example.com:8011")
+        self.assertNotIn(b"Proxy-Authorization", request.headers)
 
     def test_manual_proxy_different_auth(self):
         """Setting a custom 'proxy' request meta with a matching proxy URL
@@ -1420,9 +1492,9 @@ def test_manual_proxy_different_auth(self):
         httpproxy = HttpProxyMiddleware.from_crawler(crawler)
         auth_header = basic_auth_header("altkey", "")
 
-        meta = {'proxy': 'http://altkey:@proxy.example.com:8011'}
-        request = Request('https://example.com', meta=meta)
+        meta = {"proxy": "http://altkey:@proxy.example.com:8011"}
+        request = Request("https://example.com", meta=meta)
         self.assertEqual(smartproxy.process_request(request, self.spider), None)
         self.assertEqual(httpproxy.process_request(request, self.spider), None)
-        self.assertEqual(request.meta['proxy'], 'http://proxy.example.com:8011')
-        self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header)
+        self.assertEqual(request.meta["proxy"], "http://proxy.example.com:8011")
+        self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header)
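The manual-proxy tests above all rest on one invariant: the middleware embeds the API key as the userinfo part of the proxy URL, and Scrapy's HttpProxyMiddleware then moves those credentials into a Proxy-Authorization header. A minimal sketch of that round trip, assuming only w3lib (the "apikey" value is a placeholder, not a real key):

    from base64 import urlsafe_b64decode

    from w3lib.http import basic_auth_header

    # basic_auth_header("apikey", "") -> b"Basic YXBpa2V5Og=="
    auth = basic_auth_header("apikey", "")
    # Decoding the credential part recovers the "user:password" pair that
    # ends up as the userinfo of http://apikey:@proxy.zyte.com:8011.
    assert urlsafe_b64decode(auth[6:]) == b"apikey:"
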
From 47ed9a48e52858f86668f930cde48e2de7bb5961 Mon Sep 17 00:00:00 2001
From: Emmanuel Rondan
Date: Thu, 29 Aug 2024 10:43:30 -0300
Subject: [PATCH 3/8] ignoring pre-commit hooks applied in blame

---
 .git-blame-ignore-revs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index e746ff9..1befd81 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -1 +1,2 @@
-# applying pre-commit hooks to the project
\ No newline at end of file
+# applying pre-commit hooks to the project
+951f357d3f60257618f7174f6ad39e9406441ecb
\ No newline at end of file

From a8e298cded599fb1c05d38224925d33778cc1e7b Mon Sep 17 00:00:00 2001
From: Emmanuel Rondan
Date: Thu, 7 Nov 2024 09:50:04 -0300
Subject: [PATCH 4/8] adding pre-commit config files

---
 .bandit.yml                |  5 +++-
 .git-blame-ignore-revs     |  2 ++
 .github/workflows/main.yml | 19 ++++--------
 .isort.cfg                 |  2 ++
 .pre-commit-config.yaml    | 14 ++++++++-
 setup.py                   | 61 +++++++++++++++++---------------------
 tox.ini                    | 21 +++----------
 7 files changed, 58 insertions(+), 66 deletions(-)
 create mode 100644 .git-blame-ignore-revs
 create mode 100644 .isort.cfg

diff --git a/.bandit.yml b/.bandit.yml
index 2237265..9308817 100644
--- a/.bandit.yml
+++ b/.bandit.yml
@@ -1,3 +1,6 @@
 skips:
 - B101 # assert_used, needed for mypy
-exclude_dirs: ['tests']
+- B311
+- B320
+- B410
+exclude_dirs: ['tests']
\ No newline at end of file
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 0000000..e9c6069
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,2 @@
+# applying pre-commit hooks to the project
+e00df278aa8602b18e7c3525191b843d88334c8f
\ No newline at end of file
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index b4eb986..6f4c740 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,20 +17,6 @@ jobs:
         - python-version: "2.7"
           env:
             TOXENV: py
-        - python-version: "3.4"
-          env:
-            TOXENV: py34
-        # 3.5 cannot be tested in CI
-        # https://github.com/MatteoH2O1999/setup-python/issues/49#issuecomment-2209940822
-        - python-version: "3.6"
-          env:
-            TOXENV: py
-        - python-version: "3.7"
-          env:
-            TOXENV: py
-        - python-version: "3.8"
-          env:
-            TOXENV: py
         - python-version: "3.9"
           env:
             TOXENV: py
@@ -76,3 +62,8 @@ jobs:
         run: tox
       - name: Upload coverage.xml to codecov
         uses: codecov/codecov-action@v1
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: pre-commit/action@v3.0.0
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 0000000..6860bdb
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,2 @@
+[settings]
+profile = black
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 32465ff..3503b3a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,18 @@
 repos:
 - repo: https://github.com/PyCQA/bandit
-  rev: 1.7.10
+  rev: 1.7.9
   hooks:
   - id: bandit
     args: [-r, -c, .bandit.yml]
+- repo: https://github.com/psf/black.git
+  rev: 24.8.0
+  hooks:
+  - id: black
+- repo: https://github.com/PyCQA/flake8
+  rev: 7.1.1
+  hooks:
+  - id: flake8
+- repo: https://github.com/pycqa/isort
+  rev: 5.13.2
+  hooks:
+  - id: isort
\ No newline at end of file
diff --git a/setup.py b/setup.py
index bd620ed..085983c 100644
--- a/setup.py
+++ b/setup.py
@@ -5,41 +5,36 @@
 setup(
-    name='scrapy-zyte-smartproxy',
-    version='2.3.5',
-    license='BSD',
-    description='Scrapy middleware for Zyte Smart Proxy Manager',
+    name="scrapy-zyte-smartproxy",
+    version="2.3.5",
+    license="BSD",
+    description="Scrapy middleware for Zyte Smart Proxy Manager",
     long_description=readme,
     long_description_content_type="text/x-rst",
-    maintainer='Raul Gallegos',
-    maintainer_email='raul.ogh@gmail.com',
-    author='Zyte',
-    author_email='opensource@zyte.com',
-    url='https://github.com/scrapy-plugins/scrapy-zyte-smartproxy',
-    packages=['scrapy_zyte_smartproxy'],
-    platforms=['Any'],
+    maintainer="Raul Gallegos",
+    maintainer_email="raul.ogh@gmail.com",
+    author="Zyte",
+    author_email="opensource@zyte.com",
+    url="https://github.com/scrapy-plugins/scrapy-zyte-smartproxy",
+    packages=["scrapy_zyte_smartproxy"],
+    platforms=["Any"],
     classifiers=[
-        'Development Status :: 5 - Production/Stable',
-        'License :: OSI Approved :: BSD License',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3.4',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-        'Programming Language :: Python :: 3.10',
-        'Programming Language :: Python :: 3.11',
-        'Programming Language :: Python :: 3.12',
-        'Programming Language :: Python :: 3.13',
-        'Framework :: Scrapy',
-        'Intended Audience :: Developers',
-        'Topic :: Internet :: WWW/HTTP',
-        'Topic :: Internet :: Proxy Servers',
-        'Topic :: Software Development :: Libraries :: Application Frameworks',
-        'Topic :: Software Development :: Libraries :: Python Modules',
+        "Development Status :: 5 - Production/Stable",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
+        "Framework :: Scrapy",
+        "Intended Audience :: Developers",
+        "Topic :: Internet :: WWW/HTTP",
+        "Topic :: Internet :: Proxy Servers",
+        "Topic :: Software Development :: Libraries :: Application Frameworks",
+        "Topic :: Software Development :: Libraries :: Python Modules",
     ],
-    install_requires=['scrapy>=1.4.0', 'six', 'w3lib'],
+    install_requires=["scrapy>=1.4.0", "six", "w3lib"],
 )
diff --git a/tox.ini b/tox.ini
index 2985068..ba4015d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,6 @@
 # tox.ini
 [tox]
-envlist = pre-commit,mypy,min,py27,py34,py35,py36,py37,py38,py39,py310,py311,py312,py313,docs
+envlist = pre-commit,mypy,min,py27,py39,py310,py311,py312,py313,docs
 
 [testenv]
 deps =
@@ -10,11 +10,9 @@ commands =
     py.test --doctest-modules --cov=scrapy_zyte_smartproxy {posargs:scrapy_zyte_smartproxy tests}
 
 [testenv:pre-commit]
-basepython = python3
-deps =
-    pre-commit
-commands =
-    pre-commit run {posargs:--all-files}
+deps = pre-commit
+commands = pre-commit run --all-files --show-diff-on-failure
+skip_install = true
 
 [testenv:mypy]
 basepython = python3.10
@@ -37,17 +35,6 @@ deps =
     w3lib==1.17.0
     -rtests/requirements.txt
 
-[testenv:py34]
-basepython = python3.4
-deps =
-    Scrapy
-    six
-    # Latest Twisted that does not install an embedded version of incremental
-    # that is incompatible with Python 3.4.
-    Twisted==16.4.1
-    w3lib
-    -rtests/requirements.txt
-
 [testenv:security]
 deps =
     bandit
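Patch 5 below re-applies the same hooks to the library code itself, including middleware.py, whose ban handling draws its delays from utils.exp_backoff. As a rough sketch of the delay pattern the tests in patch 2 assert (15, 30, 60, ... seconds, jittered and capped), with the caveat that the packaged generator may differ in detail:

    import random

    def exp_backoff(step, max_delay):
        # Yield jittered delays growing as step * 2**attempt, capped at
        # max_delay. Hypothetical reimplementation for illustration only.
        attempt = 0
        while True:
            yield random.uniform(0, min(max_delay, step * 2**attempt))
            attempt += 1
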
From 05665a6fb1717ef513d7a8ac87b8eb499a64cdc9 Mon Sep 17 00:00:00 2001
From: Emmanuel Rondan
Date: Thu, 7 Nov 2024 10:00:25 -0300
Subject: [PATCH 5/8] applying pre-commit hooks

---
 docs/conf.py                         |  67 +--
 scrapy_zyte_smartproxy/__init__.py   |   4 +-
 scrapy_zyte_smartproxy/middleware.py | 313 +++++-----
 scrapy_zyte_smartproxy/utils.py      |   5 +-
 tests/test_all.py                    | 725 +++++++++++++++------------
 5 files changed, 620 insertions(+), 494 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index dff5539..99b7141 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -12,12 +12,13 @@
 # All configuration values have a default; values that are commented out
 # serve to show the default.
 
+import sys
+
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
 from os import path
-import sys
 
 sys.path.insert(0, path.dirname(path.dirname(__file__)))
 
@@ -34,25 +35,25 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autosectionlabel',
+    "sphinx.ext.autosectionlabel",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
 # source_suffix = ['.rst', '.md']
-source_suffix = {'.rst': 'restructuredtext'}
+source_suffix = {".rst": "restructuredtext"}
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = u'scrapy-zyte-smartproxy'
-copyright = u'2011-2021, Zyte Group Ltd'
-author = u'Zyte'
+project = "scrapy-zyte-smartproxy"
+copyright = "2011-2021, Zyte Group Ltd"
+author = "Zyte"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -61,19 +62,20 @@
 try:
     import scrapy_zyte_smartproxy
-    version = '.'.join(scrapy_zyte_smartproxy.__version__.split('.')[:2])
+
+    version = ".".join(scrapy_zyte_smartproxy.__version__.split(".")[:2])
     release = scrapy_zyte_smartproxy.__version__
 except ImportError:
-    version = ''
-    release = ''
+    version = ""
+    release = ""
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
@@ -94,13 +96,13 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-#html_static_path = ['_static']
+# html_static_path = ['_static']
 
 
 # -- Options for HTMLHelp output ------------------------------------------
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'scrapy-zyte-smartproxydoc'
+htmlhelp_basename = "scrapy-zyte-smartproxydoc"
 
 
 # -- Options for LaTeX output ---------------------------------------------
@@ -109,15 +111,12 @@
     # The paper size ('letterpaper' or 'a4paper').
     #
     # 'papersize': 'letterpaper',
-
     # The font size ('10pt', '11pt' or '12pt').
     #
     # 'pointsize': '10pt',
-
     # Additional stuff for the LaTeX preamble.
     #
     # 'preamble': '',
-
     # Latex figure (float) alignment
     #
     # 'figure_align': 'htbp',
@@ -129,10 +128,10 @@
 latex_documents = [
     (
         master_doc,
-        'scrapy-zyte-smartproxy.tex',
-        u'scrapy-zyte-smartproxy Documentation',
-        u'Zyte',
-        'manual',
+        "scrapy-zyte-smartproxy.tex",
+        "scrapy-zyte-smartproxy Documentation",
+        "Zyte",
+        "manual",
     ),
 ]
 
@@ -142,8 +141,13 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    (master_doc, 'scrapy-zyte-smartproxy', u'scrapy-zyte-smartproxy Documentation',
-     [author], 1)
+    (
+        master_doc,
+        "scrapy-zyte-smartproxy",
+        "scrapy-zyte-smartproxy Documentation",
+        [author],
+        1,
+    )
 ]
 
 
@@ -153,10 +157,13 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'scrapy-zyte-smartproxy', u'scrapy-zyte-smartproxy Documentation',
-     author, 'scrapy-zyte-smartproxy', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        master_doc,
+        "scrapy-zyte-smartproxy",
+        "scrapy-zyte-smartproxy Documentation",
+        author,
+        "scrapy-zyte-smartproxy",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]
-
-
-
diff --git a/scrapy_zyte_smartproxy/__init__.py b/scrapy_zyte_smartproxy/__init__.py
index 3b44f47..84d3e49 100644
--- a/scrapy_zyte_smartproxy/__init__.py
+++ b/scrapy_zyte_smartproxy/__init__.py
@@ -1,4 +1,4 @@
 from .middleware import ZyteSmartProxyMiddleware
 
-
-__version__ = '2.3.5'
+__version__ = "2.3.5"
+__all__ = ["ZyteSmartProxyMiddleware"]
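The middleware.py diff that follows is dominated by reformatting around the SPM/Zyte API header translation maps. A toy illustration of how such a mapping is applied, using one pair taken from the diff itself (the translate helper is hypothetical, not part of the patch):

    ZYTE_API_TO_SPM = {b"zyte-jobid": b"x-crawlera-jobid"}

    def translate(headers, mapping):
        # Rename each known header to its counterpart, keeping the value.
        for source, target in mapping.items():
            if source in headers:
                headers[target] = headers.pop(source)
        return headers

    assert translate({b"zyte-jobid": b"2816"}, ZYTE_API_TO_SPM) == {
        b"x-crawlera-jobid": b"2816"
    }
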
"X-Crawlera-" # Deprecated + header_lowercase_prefixes = ("zyte-", "x-crawlera-") + conflicting_headers = ("X-Crawlera-Profile", "X-Crawlera-UA") backoff_step = 15 backoff_max = 180 exp_backoff = None @@ -52,22 +51,24 @@ class ZyteSmartProxyMiddleware(object): b"zyte-jobid": b"x-crawlera-jobid", b"zyte-override-headers": b"x-crawlera-profile-pass", } - spm_to_zyte_api_translations = {v: k for k, v in zyte_api_to_spm_translations.items()} + spm_to_zyte_api_translations = { + v: k for k, v in zyte_api_to_spm_translations.items() + } _settings = [ - ('apikey', str), - ('url', str), - ('maxbans', int), - ('download_timeout', int), - ('preserve_delay', bool), - ('backoff_step', int), - ('backoff_max', int), - ('force_enable_on_http_codes', list), + ("apikey", str), + ("url", str), + ("maxbans", int), + ("download_timeout", int), + ("preserve_delay", bool), + ("backoff_step", int), + ("backoff_max", int), + ("force_enable_on_http_codes", list), ] def __init__(self, crawler): self.crawler = crawler - self.job_id = os.environ.get('SCRAPY_JOB') + self.job_id = os.environ.get("SCRAPY_JOB") self.spider = None self._bans = defaultdict(int) self._saved_delays = defaultdict(lambda: None) @@ -85,14 +86,14 @@ def from_crawler(cls, crawler): def _make_auth_url(self, spider): parsed_url = urlparse(self.url) auth = self.get_proxyauth(spider) - if not auth.startswith(b'Basic '): + if not auth.startswith(b"Basic "): raise ValueError( - 'Zyte proxy services only support HTTP basic access ' - 'authentication, but %s.%s.get_proxyauth() returned %r' + "Zyte proxy services only support HTTP basic access " + "authentication, but %s.%s.get_proxyauth() returned %r" % (self.__module__, self.__class__.__name__, auth) ) - user_and_colon = urlsafe_b64decode(auth[6:].strip()).decode('utf-8') - netloc = user_and_colon + '@' + parsed_url.netloc.split('@')[-1] + user_and_colon = urlsafe_b64decode(auth[6:].strip()).decode("utf-8") + netloc = user_and_colon + "@" + parsed_url.netloc.split("@")[-1] parsed_url = parsed_url._replace(netloc=netloc) return urlunparse(parsed_url) @@ -104,7 +105,9 @@ def open_spider(self, spider): setattr(self, k, self._get_setting_value(spider, k, type_)) self._fix_url_protocol() - self._headers = self.crawler.settings.get('ZYTE_SMARTPROXY_DEFAULT_HEADERS', {}).items() + self._headers = self.crawler.settings.get( + "ZYTE_SMARTPROXY_DEFAULT_HEADERS", {} + ).items() self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max) if not self.enabled and not self.force_enable_on_http_codes: @@ -113,7 +116,7 @@ def open_spider(self, spider): if not self.apikey: logger.warning( "Zyte proxy services cannot be used without an API key", - extra={'spider': spider}, + extra={"spider": spider}, ) return @@ -121,10 +124,9 @@ def open_spider(self, spider): self._authless_url = _remove_auth(self._auth_url) logger.info( - "Using Zyte proxy service %s with an API key ending in %s" % ( - self.url, self.apikey[:7] - ), - extra={'spider': spider}, + "Using Zyte proxy service %s with an API key ending in %s" + % (self.url, self.apikey[:7]), + extra={"spider": spider}, ) if not self.preserve_delay: @@ -136,7 +138,7 @@ def open_spider(self, spider): "To avoid this behaviour you can use the " "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind " "that this may slow down the crawl significantly", - extra={'spider': spider}, + extra={"spider": spider}, ) def _settings_get(self, type_, *a, **kw): @@ -152,49 +154,69 @@ def _settings_get(self, type_, *a, **kw): return self.crawler.settings.get(*a, **kw) def 
_get_setting_value(self, spider, k, type_): - if hasattr(spider, 'hubproxy_' + k): - warnings.warn('hubproxy_%s attribute is deprecated, ' - 'use zyte_smartproxy_%s instead.' % (k, k), - category=ScrapyDeprecationWarning, stacklevel=1) + if hasattr(spider, "hubproxy_" + k): + warnings.warn( + "hubproxy_%s attribute is deprecated, " + "use zyte_smartproxy_%s instead." % (k, k), + category=ScrapyDeprecationWarning, + stacklevel=1, + ) - if self.crawler.settings.get('HUBPROXY_%s' % k.upper()) is not None: - warnings.warn('HUBPROXY_%s setting is deprecated, ' - 'use ZYTE_SMARTPROXY_%s instead.' % (k.upper(), k.upper()), - category=ScrapyDeprecationWarning, stacklevel=1) + if self.crawler.settings.get("HUBPROXY_%s" % k.upper()) is not None: + warnings.warn( + "HUBPROXY_%s setting is deprecated, " + "use ZYTE_SMARTPROXY_%s instead." % (k.upper(), k.upper()), + category=ScrapyDeprecationWarning, + stacklevel=1, + ) o = getattr(self, k, None) s = self._settings_get( - type_, 'ZYTE_SMARTPROXY_' + k.upper(), self._settings_get( - type_, 'HUBPROXY_' + k.upper(), o)) + type_, + "ZYTE_SMARTPROXY_" + k.upper(), + self._settings_get(type_, "HUBPROXY_" + k.upper(), o), + ) return getattr( - spider, 'zyte_smartproxy_' + k, getattr(spider, 'hubproxy_' + k, s)) + spider, "zyte_smartproxy_" + k, getattr(spider, "hubproxy_" + k, s) + ) def _fix_url_protocol(self): - if self.url.startswith('https://'): - logger.warning('ZYTE_SMARTPROXY_URL "%s" set with "https://" protocol.' % self.url) - elif not self.url.startswith('http://'): + if self.url.startswith("https://"): + logger.warning( + 'ZYTE_SMARTPROXY_URL "%s" set with "https://" protocol.' % self.url + ) + elif not self.url.startswith("http://"): logger.warning('Adding "http://" to ZYTE_SMARTPROXY_URL %s' % self.url) - self.url = 'http://' + self.url + self.url = "http://" + self.url def is_enabled(self, spider): """Hook to enable middleware by custom rules.""" - if hasattr(spider, 'use_hubproxy'): - warnings.warn('use_hubproxy attribute is deprecated, ' - 'use zyte_smartproxy_enabled instead.', - category=ScrapyDeprecationWarning, stacklevel=1) - - if self.crawler.settings.get('HUBPROXY_ENABLED') is not None: - warnings.warn('HUBPROXY_ENABLED setting is deprecated, ' - 'use ZYTE_SMARTPROXY_ENABLED instead.', - category=ScrapyDeprecationWarning, stacklevel=1) - return ( - getattr(spider, 'zyte_smartproxy_enabled', self.crawler.settings.getbool('ZYTE_SMARTPROXY_ENABLED')) or - getattr(spider, 'use_hubproxy', self.crawler.settings.getbool("HUBPROXY_ENABLED")) + if hasattr(spider, "use_hubproxy"): + warnings.warn( + "use_hubproxy attribute is deprecated, " + "use zyte_smartproxy_enabled instead.", + category=ScrapyDeprecationWarning, + stacklevel=1, + ) + + if self.crawler.settings.get("HUBPROXY_ENABLED") is not None: + warnings.warn( + "HUBPROXY_ENABLED setting is deprecated, " + "use ZYTE_SMARTPROXY_ENABLED instead.", + category=ScrapyDeprecationWarning, + stacklevel=1, + ) + return getattr( + spider, + "zyte_smartproxy_enabled", + self.crawler.settings.getbool("ZYTE_SMARTPROXY_ENABLED"), + ) or getattr( + spider, "use_hubproxy", self.crawler.settings.getbool("HUBPROXY_ENABLED") ) def get_proxyauth(self, spider): """Hook to compute Proxy-Authorization header by custom rules.""" - return basic_auth_header(self.apikey, '') + return basic_auth_header(self.apikey, "") def _targets_zyte_api(self, request): if self._auth_url is None: @@ -208,7 +230,8 @@ def _targets_zyte_api(self, request): def _translate_headers(self, request, targets_zyte_api): translation_dict = 
( - self.spm_to_zyte_api_translations if targets_zyte_api + self.spm_to_zyte_api_translations + if targets_zyte_api else self.zyte_api_to_spm_translations ) for header, translation in translation_dict.items(): @@ -229,10 +252,10 @@ def _inc_stat(self, stat, targets_zyte_api, value=1): def process_request(self, request, spider): if self._is_enabled_for_request(request): - if 'proxy' not in request.meta: - request.meta['proxy'] = self._auth_url + if "proxy" not in request.meta: + request.meta["proxy"] = self._auth_url elif ( - request.meta['proxy'] == self._authless_url + request.meta["proxy"] == self._authless_url and b"Proxy-Authorization" not in request.headers ): logger.warning( @@ -243,55 +266,60 @@ def process_request(self, request, spider): "middlewares from one request to another is a bad " "practice that can cause issues.".format(request=request) ) - request.meta['proxy'] = self._auth_url + request.meta["proxy"] = self._auth_url targets_zyte_api = self._targets_zyte_api(request) self._set_zyte_smartproxy_default_headers(request) - request.meta['download_timeout'] = self.download_timeout + request.meta["download_timeout"] = self.download_timeout if self.job_id: - job_header = 'Zyte-JobId' if targets_zyte_api else 'X-Crawlera-JobId' + job_header = "Zyte-JobId" if targets_zyte_api else "X-Crawlera-JobId" request.headers[job_header] = self.job_id - user_agent_header = "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client" + user_agent_header = ( + "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client" + ) from scrapy_zyte_smartproxy import __version__ - request.headers[user_agent_header] = 'scrapy-zyte-smartproxy/%s' % __version__ + + request.headers[user_agent_header] = ( + "scrapy-zyte-smartproxy/%s" % __version__ + ) self._inc_stat("request", targets_zyte_api=targets_zyte_api) - self._inc_stat("request/method/{}".format(request.method), targets_zyte_api=targets_zyte_api) + self._inc_stat( + "request/method/{}".format(request.method), + targets_zyte_api=targets_zyte_api, + ) self._translate_headers(request, targets_zyte_api=targets_zyte_api) - self._clean_zyte_smartproxy_headers(request, targets_zyte_api=targets_zyte_api) + self._clean_zyte_smartproxy_headers( + request, targets_zyte_api=targets_zyte_api + ) else: self._clean_zyte_smartproxy_headers(request) def _is_banned(self, response): return ( response.status == self.ban_code - and response.headers.get('X-Crawlera-Error') == b'banned' - ) or ( - response.status in {520, 521} - and response.headers.get('Zyte-Error') - ) + and response.headers.get("X-Crawlera-Error") == b"banned" + ) or (response.status in {520, 521} and response.headers.get("Zyte-Error")) def _is_auth_error(self, response): return ( - response.status == 407 and - response.headers.get('X-Crawlera-Error') == b'bad_proxy_auth' + response.status == 407 + and response.headers.get("X-Crawlera-Error") == b"bad_proxy_auth" ) def _throttle_error(self, response): - error = response.headers.get('Zyte-Error') or response.headers.get('X-Crawlera-Error') - if ( - response.status in {429, 503} - and error - and error != b"banned" - ): + error = response.headers.get("Zyte-Error") or response.headers.get( + "X-Crawlera-Error" + ) + if response.status in {429, 503} and error and error != b"banned": return error.decode() return None def _process_error(self, response): if "Zyte-Error" in response.headers: - value = response.headers.get('Zyte-Error') + value = response.headers.get("Zyte-Error") response.headers["X-Crawlera-Error"] = value return value if "X-Crawlera-Error" in 
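# ---- editor's note -----------------------------------------------------
# _process_error above (its second branch continues right after this note)
# mirrors whichever error header is present onto the other name, so later
# code can read either X-Crawlera-Error or Zyte-Error. A dict-based sketch
# of the same idea (Scrapy's Headers are case-insensitive; a plain dict is
# not):

def mirror_error_header(headers):
    if "Zyte-Error" in headers:
        headers["X-Crawlera-Error"] = headers["Zyte-Error"]
        return headers["Zyte-Error"]
    if "X-Crawlera-Error" in headers:
        headers["Zyte-Error"] = headers["X-Crawlera-Error"]
        return headers["X-Crawlera-Error"]
    return None

h = {"Zyte-Error": b"/limits/over-user-limit"}
assert mirror_error_header(h) == b"/limits/over-user-limit"
assert h["X-Crawlera-Error"] == b"/limits/over-user-limit"
# -------------------------------------------------------------------------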
response.headers: - value = response.headers.get('X-Crawlera-Error') + value = response.headers.get("X-Crawlera-Error") response.headers["Zyte-Error"] = value return value return None @@ -302,7 +330,9 @@ def process_response(self, request, response, spider): targets_zyte_api = self._targets_zyte_api(request) if not self._is_enabled_for_request(request): - return self._handle_not_enabled_response(request, response, targets_zyte_api=targets_zyte_api) + return self._handle_not_enabled_response( + request, response, targets_zyte_api=targets_zyte_api + ) if not self._is_zyte_smartproxy_or_zapi_response(response): return response @@ -314,11 +344,16 @@ def process_response(self, request, response, spider): throttle_error = self._throttle_error(response) if is_auth_error or throttle_error: if is_auth_error: - reason = 'autherror' + reason = "autherror" else: assert throttle_error reason = throttle_error.lstrip("/") - self._set_custom_delay(request, next(self.exp_backoff), reason=reason, targets_zyte_api=targets_zyte_api) + self._set_custom_delay( + request, + next(self.exp_backoff), + reason=reason, + targets_zyte_api=targets_zyte_api, + ) else: self._inc_stat("delay/reset_backoff", targets_zyte_api=targets_zyte_api) self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max) @@ -326,35 +361,49 @@ def process_response(self, request, response, spider): if is_auth_error: # When Zyte Smart Proxy Manager has issues it might not be able to # authenticate users we must retry - retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0) + retries = request.meta.get("zyte_smartproxy_auth_retry_times", 0) if retries < self.max_auth_retry_times: - return self._retry_auth(response, request, spider, targets_zyte_api=targets_zyte_api) + return self._retry_auth( + response, request, spider, targets_zyte_api=targets_zyte_api + ) else: - self._inc_stat("retries/auth/max_reached", targets_zyte_api=targets_zyte_api) + self._inc_stat( + "retries/auth/max_reached", targets_zyte_api=targets_zyte_api + ) logger.warning( "Max retries for authentication issues reached, please check auth" " information settings", - extra={'spider': self.spider}, + extra={"spider": self.spider}, ) if self._is_banned(response): self._bans[key] += 1 if self._bans[key] > self.maxbans: - self.crawler.engine.close_spider(spider, 'banned') + self.crawler.engine.close_spider(spider, "banned") else: - after = response.headers.get('retry-after') + after = response.headers.get("retry-after") if after: - self._set_custom_delay(request, float(after), reason='banned', targets_zyte_api=targets_zyte_api) + self._set_custom_delay( + request, + float(after), + reason="banned", + targets_zyte_api=targets_zyte_api, + ) self._inc_stat("response/banned", targets_zyte_api=targets_zyte_api) else: self._bans[key] = 0 # If placed behind `RedirectMiddleware`, it would not count 3xx responses self._inc_stat("response", targets_zyte_api=targets_zyte_api) - self._inc_stat("response/status/{}".format(response.status), targets_zyte_api=targets_zyte_api) + self._inc_stat( + "response/status/{}".format(response.status), + targets_zyte_api=targets_zyte_api, + ) if zyte_smartproxy_error: self._inc_stat("response/error", targets_zyte_api=targets_zyte_api) - error_msg = zyte_smartproxy_error.decode('utf8') - self._inc_stat("response/error/{}".format(error_msg), targets_zyte_api=targets_zyte_api) + error_msg = zyte_smartproxy_error.decode("utf8") + self._inc_stat( + "response/error/{}".format(error_msg), targets_zyte_api=targets_zyte_api + ) return response def 
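# ---- editor's note -----------------------------------------------------
# The ban accounting in process_response above: consecutive banned
# responses increment a per-slot counter, any other response resets it, and
# crossing maxbans closes the spider with reason "banned". A compact sketch
# of that state machine (close_spider stands in for
# crawler.engine.close_spider):

from collections import defaultdict

class BanCounter:
    def __init__(self, maxbans=400):
        self.maxbans = maxbans
        self.bans = defaultdict(int)

    def record(self, key, banned, close_spider):
        if banned:
            self.bans[key] += 1
            if self.bans[key] > self.maxbans:
                close_spider("banned")
        else:
            self.bans[key] = 0

closed = []
counter = BanCounter(maxbans=2)
for _ in range(3):  # the third consecutive ban crosses maxbans
    counter.record(None, True, closed.append)
assert closed == ["banned"]
# -------------------------------------------------------------------------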
process_exception(self, request, exception, spider): @@ -364,7 +413,12 @@ def process_exception(self, request, exception, spider): # Handle Zyte Smart Proxy Manager downtime self._clear_dns_cache() targets_zyte_api = self._targets_zyte_api(request) - self._set_custom_delay(request, self.connection_refused_delay, reason='conn_refused', targets_zyte_api=targets_zyte_api) + self._set_custom_delay( + request, + self.connection_refused_delay, + reason="conn_refused", + targets_zyte_api=targets_zyte_api, + ) def _handle_not_enabled_response(self, request, response, targets_zyte_api): if self._should_enable_for_response(response): @@ -373,7 +427,9 @@ def _handle_not_enabled_response(self, request, response, targets_zyte_api): retryreq = request.copy() retryreq.dont_filter = True - self._inc_stat("retries/should_have_been_enabled", targets_zyte_api=targets_zyte_api) + self._inc_stat( + "retries/should_have_been_enabled", targets_zyte_api=targets_zyte_api + ) return retryreq return response @@ -383,11 +439,11 @@ def _retry_auth(self, response, request, spider, targets_zyte_api): "Retrying a request due to an authentication issue with " "the configured Zyte proxy service" ), - extra={'spider': self.spider}, + extra={"spider": self.spider}, ) - retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0) + 1 + retries = request.meta.get("zyte_smartproxy_auth_retry_times", 0) + 1 retryreq = request.copy() - retryreq.meta['zyte_smartproxy_auth_retry_times'] = retries + retryreq.meta["zyte_smartproxy_auth_retry_times"] = retries retryreq.dont_filter = True self._inc_stat("retries/auth", targets_zyte_api=targets_zyte_api) return retryreq @@ -403,7 +459,7 @@ def _should_enable_for_response(self, response): def _is_enabled_for_request(self, request): domain = self._get_url_domain(request.url) domain_enabled = self.enabled_for_domain.get(domain, False) - dont_proxy = request.meta.get('dont_proxy', False) + dont_proxy = request.meta.get("dont_proxy", False) return (domain_enabled or self.enabled) and not dont_proxy def _get_url_domain(self, url): @@ -418,7 +474,7 @@ def _is_zyte_smartproxy_or_zapi_response(self, response): ) def _get_slot_key(self, request): - return request.meta.get('download_slot') + return request.meta.get("download_slot") def _get_slot(self, request): key = self._get_slot_key(request) @@ -434,7 +490,11 @@ def _set_custom_delay(self, request, delay, targets_zyte_api, reason=None): slot.delay = delay if reason is not None: self._inc_stat("delay/{}".format(reason), targets_zyte_api=targets_zyte_api) - self._inc_stat("delay/{}/total".format(reason), value=delay, targets_zyte_api=targets_zyte_api) + self._inc_stat( + "delay/{}/total".format(reason), + value=delay, + targets_zyte_api=targets_zyte_api, + ) def _restore_original_delay(self, request): """Restore original delay for slot if it was changed.""" @@ -449,9 +509,9 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None): if targets_zyte_api is None: prefixes = self.header_lowercase_prefixes elif targets_zyte_api: - prefixes = ('x-crawlera-',) + prefixes = ("x-crawlera-",) else: - prefixes = ('zyte-',) + prefixes = ("zyte-",) targets = [ header for header in request.headers @@ -471,7 +531,7 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None): "request is proxied with %s and not with %s, and " "automatic translation is not supported for this " "header. 
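# ---- editor's note -----------------------------------------------------
# _set_custom_delay / _restore_original_delay above stash the slot's
# configured delay before overriding it (backoff, Retry-After, connection
# refused) and put it back once a healthy response arrives. A sketch with a
# stand-in Slot class; Scrapy's downloader slots expose a mutable .delay:

class Slot:
    def __init__(self, delay=0.5):
        self.delay = delay

def set_custom_delay(slot, saved, key, delay):
    if saved.get(key) is None:       # remember the original delay once
        saved[key] = slot.delay
    slot.delay = delay

def restore_original_delay(slot, saved, key):
    if saved.get(key) is not None:
        slot.delay = saved.pop(key)

slot, saved = Slot(0.5), {}
set_custom_delay(slot, saved, "example.com", 99.0)
assert slot.delay == 99.0
restore_original_delay(slot, saved, "example.com")
assert slot.delay == 0.5
# -------------------------------------------------------------------------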
See " - "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping" + "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping" # noqa " to learn the right way to translate this header " "manually." ), @@ -485,11 +545,8 @@ def _clean_zyte_smartproxy_headers(self, request, targets_zyte_api=None): def _is_zyte_smartproxy_header(self, header_name, prefixes): if not header_name: return False - header_name = header_name.decode('utf-8').lower() - return any( - header_name.startswith(prefix) - for prefix in prefixes - ) + header_name = header_name.decode("utf-8").lower() + return any(header_name.startswith(prefix) for prefix in prefixes) def _set_zyte_smartproxy_default_headers(self, request): for header, value in self._headers: @@ -497,23 +554,21 @@ def _set_zyte_smartproxy_default_headers(self, request): continue request.headers.setdefault(header, value) lower_case_headers = [ - header.decode('utf-8').lower() for header in request.headers + header.decode("utf-8").lower() for header in request.headers ] if all(h.lower() in lower_case_headers for h in self.conflicting_headers): # Send a general warning once, and specific urls if LOG_LEVEL = DEBUG warnings.warn( - 'The headers %s are conflicting on some of your requests. ' - 'Please check ' - 'https://docs.zyte.com/smart-proxy-manager.html#request-headers ' - 'for more information. You can set LOG_LEVEL=DEBUG to see the ' - 'urls with problems.' - % str(self.conflicting_headers) + "The headers %s are conflicting on some of your requests. " + "Please check " + "https://docs.zyte.com/smart-proxy-manager.html#request-headers " + "for more information. You can set LOG_LEVEL=DEBUG to see the " + "urls with problems." % str(self.conflicting_headers) ) logger.debug( - 'The headers %s are conflicting on request %s. X-Crawlera-UA ' - 'will be ignored. Please check ' - 'https://docs.zyte.com/smart-proxy-manager.html#request-headers ' - 'for more information' - % (str(self.conflicting_headers), request.url), - extra={'spider': self.spider}, + "The headers %s are conflicting on request %s. X-Crawlera-UA " + "will be ignored. 
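# ---- editor's note -----------------------------------------------------
# The warning text above belongs to the header-cleaning pass: headers
# matching the "wrong" lowercase prefix for the chosen target are dropped
# from the request. A sketch over a plain dict of str headers:

def clean_proxy_headers(headers, targets_zyte_api):
    prefixes = ("x-crawlera-",) if targets_zyte_api else ("zyte-",)
    dropped = [
        name
        for name in headers
        if any(name.lower().startswith(prefix) for prefix in prefixes)
    ]
    for name in dropped:
        del headers[name]
    return dropped

h = {"X-Crawlera-Profile": "desktop", "Accept": "text/html"}
assert clean_proxy_headers(h, targets_zyte_api=True) == ["X-Crawlera-Profile"]
assert h == {"Accept": "text/html"}
# -------------------------------------------------------------------------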
Please check " + "https://docs.zyte.com/smart-proxy-manager.html#request-headers " + "for more information" % (str(self.conflicting_headers), request.url), + extra={"spider": self.spider}, ) diff --git a/scrapy_zyte_smartproxy/utils.py b/scrapy_zyte_smartproxy/utils.py index fa37659..d6a49cc 100644 --- a/scrapy_zyte_smartproxy/utils.py +++ b/scrapy_zyte_smartproxy/utils.py @@ -1,16 +1,15 @@ import math import random - from itertools import count def exp_backoff(step, max): - """ Exponential backoff time with Full Jitter """ + """Exponential backoff time with Full Jitter""" # this is a numerically stable version of # random.uniform(0, min(max, step * 2 ** attempt)) max_attempts = math.log(max / step, 2) for attempt in count(0, 1): if attempt <= max_attempts: - yield random.uniform(0, step * 2 ** attempt) # nosec + yield random.uniform(0, step * 2**attempt) # nosec else: yield random.uniform(0, max) # nosec diff --git a/tests/test_all.py b/tests/test_all.py index bedc8c6..cd400d4 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -1,26 +1,26 @@ import binascii import os -import pytest from copy import copy from random import choice from unittest import TestCase + +import pytest + try: from unittest.mock import call, patch # type: ignore except ImportError: from mock import call, patch # type: ignore -from w3lib.http import basic_auth_header from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http import Request, Response +from scrapy.resolver import dnscache from scrapy.spiders import Spider from scrapy.utils.test import get_crawler -from scrapy.resolver import dnscache -from scrapy.exceptions import ScrapyDeprecationWarning -from twisted.internet.error import ConnectionRefusedError, ConnectionDone - -from scrapy_zyte_smartproxy import __version__, ZyteSmartProxyMiddleware -from scrapy_zyte_smartproxy.utils import exp_backoff +from twisted.internet.error import ConnectionDone, ConnectionRefusedError +from w3lib.http import basic_auth_header +from scrapy_zyte_smartproxy import ZyteSmartProxyMiddleware, __version__ RESPONSE_IDENTIFYING_HEADERS = ( ("X-Crawlera-Version", None), @@ -44,12 +44,14 @@ class ZyteSmartProxyMiddlewareTestCase(TestCase): auth_error_code = 407 def setUp(self): - self.spider = Spider('foo') - self.settings = {'ZYTE_SMARTPROXY_APIKEY': 'apikey'} + self.spider = Spider("foo") + self.settings = {"ZYTE_SMARTPROXY_APIKEY": "apikey"} Response_init_orig = Response.__init__ def Response_init_new(self, *args, **kwargs): - assert not kwargs.get('request'), 'response objects at this stage shall not be pinned' + assert not kwargs.get( + "request" + ), "response objects at this stage shall not be pinned" return Response_init_orig(self, *args, **kwargs) Response.__init__ = Response_init_new @@ -81,62 +83,67 @@ def _assert_disabled(self, spider, settings=None): crawler = self._mock_crawler(spider, settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) - req = Request('http://example.com') + req = Request("http://example.com") out = mw.process_request(req, spider) self.assertEqual(out, None) - self.assertEqual(req.meta.get('proxy'), None) - self.assertEqual(req.meta.get('download_timeout'), None) - self.assertEqual(req.headers.get('Proxy-Authorization'), None) + self.assertEqual(req.meta.get("proxy"), None) + self.assertEqual(req.meta.get("download_timeout"), None) + self.assertEqual(req.headers.get("Proxy-Authorization"), None) res = Response(req.url) assert 
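# ---- editor's note -----------------------------------------------------
# The exp_backoff generator reformatted in utils.py above implements "Full
# Jitter": each value is random.uniform(0, cap), with the cap doubling per
# attempt (step, 2*step, 4*step, ...) and clamping at max once
# step * 2**attempt would exceed it. Demonstrating the caps with
# random.uniform patched to return its upper bound, the same mocking idiom
# the tests below use:

from unittest.mock import patch

from scrapy_zyte_smartproxy.utils import exp_backoff

with patch("random.uniform", side_effect=lambda lo, hi: hi):
    gen = exp_backoff(step=15, max=70)
    caps = [next(gen) for _ in range(5)]
assert caps == [15, 30, 60, 70, 70]
# -------------------------------------------------------------------------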
mw.process_response(req, res, spider) is res res = Response(req.url, status=mw.ban_code) assert mw.process_response(req, res, spider) is res - def _assert_enabled(self, spider, - settings=None, - proxyurl='http://proxy.zyte.com:8011', - proxyurlcreds='http://apikey:@proxy.zyte.com:8011', - proxyauth=basic_auth_header('apikey', ''), - maxbans=400, - download_timeout=190): + def _assert_enabled( + self, + spider, + settings=None, + proxyurl="http://proxy.zyte.com:8011", + proxyurlcreds="http://apikey:@proxy.zyte.com:8011", + proxyauth=basic_auth_header("apikey", ""), + maxbans=400, + download_timeout=190, + ): crawler = self._mock_crawler(spider, settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) assert mw.url == proxyurl - req = Request('http://example.com') + req = Request("http://example.com") assert mw.process_request(req, spider) is None - self.assertEqual(req.meta.get('proxy'), proxyurlcreds) - self.assertEqual(req.meta.get('download_timeout'), download_timeout) - self.assertNotIn(b'Proxy-Authorization', req.headers) + self.assertEqual(req.meta.get("proxy"), proxyurlcreds) + self.assertEqual(req.meta.get("download_timeout"), download_timeout) + self.assertNotIn(b"Proxy-Authorization", req.headers) res = self._mock_zyte_smartproxy_response(req.url) assert mw.process_response(req, res, spider) is res # disabled if 'dont_proxy=True' is set - req = Request('http://example.com') - req.meta['dont_proxy'] = True + req = Request("http://example.com") + req.meta["dont_proxy"] = True assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None - self.assertEqual(req.meta.get('proxy'), None) - self.assertEqual(req.meta.get('download_timeout'), None) - self.assertNotIn(b'Proxy-Authorization', req.headers) + self.assertEqual(req.meta.get("proxy"), None) + self.assertEqual(req.meta.get("download_timeout"), None) + self.assertNotIn(b"Proxy-Authorization", req.headers) res = self._mock_zyte_smartproxy_response(req.url) assert mw.process_response(req, res, spider) is res - del req.meta['dont_proxy'] + del req.meta["dont_proxy"] assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None - self.assertEqual(req.meta.get('proxy'), proxyurl) - self.assertEqual(req.meta.get('download_timeout'), download_timeout) - self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth) + self.assertEqual(req.meta.get("proxy"), proxyurl) + self.assertEqual(req.meta.get("download_timeout"), download_timeout) + self.assertEqual(req.headers.get("Proxy-Authorization"), proxyauth) if maxbans > 0: # assert ban count is reseted after a succesful response - res = self._mock_zyte_smartproxy_response('http://banned.example', status=self.bancode) + res = self._mock_zyte_smartproxy_response( + "http://banned.example", status=self.bancode + ) assert mw.process_response(req, res, spider) is res self.assertEqual(crawler.engine.fake_spider_closed_result, None) - res = self._mock_zyte_smartproxy_response('http://unbanned.example') + res = self._mock_zyte_smartproxy_response("http://unbanned.example") assert mw.process_response(req, res, spider) is res self.assertEqual(crawler.engine.fake_spider_closed_result, None) self.assertEqual(mw._bans[None], 0) @@ -145,22 +152,22 @@ def _assert_enabled(self, spider, for x in range(maxbans + 1): self.assertEqual(crawler.engine.fake_spider_closed_result, None) res = self._mock_zyte_smartproxy_response( - 'http://banned.example/%d' % x, + 
'http://banned.example/%d' % x,
+
"http://banned.example/%d" % x, status=self.bancode, - headers={'X-Crawlera-Error': 'banned'}, + headers={"X-Crawlera-Error": "banned"}, ) assert mw.process_response(req, res, spider) is res assert res.headers["X-Crawlera-Error"] == b"banned" assert res.headers["Zyte-Error"] == b"banned" # max bans reached and close_spider called - self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, 'banned')) + self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, "banned")) def test_disabled_by_lack_of_zyte_smartproxy_settings(self): self._assert_disabled(self.spider, settings={}) def test_spider_zyte_smartproxy_enabled(self): - self.assertFalse(hasattr(self.spider, 'zyte_smartproxy_enabled')) + self.assertFalse(hasattr(self.spider, "zyte_smartproxy_enabled")) self._assert_disabled(self.spider, self.settings) self.spider.zyte_smartproxy_enabled = True self._assert_enabled(self.spider, self.settings) @@ -169,89 +176,119 @@ def test_spider_zyte_smartproxy_enabled(self): def test_enabled(self): self._assert_disabled(self.spider, self.settings) - self.settings['ZYTE_SMARTPROXY_ENABLED'] = True + self.settings["ZYTE_SMARTPROXY_ENABLED"] = True self._assert_enabled(self.spider, self.settings) def test_spider_zyte_smartproxy_enabled_priority(self): self.spider.zyte_smartproxy_enabled = False - self.settings['ZYTE_SMARTPROXY_ENABLED'] = True + self.settings["ZYTE_SMARTPROXY_ENABLED"] = True self._assert_disabled(self.spider, self.settings) self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_ENABLED'] = False + self.settings["ZYTE_SMARTPROXY_ENABLED"] = False self._assert_enabled(self.spider, self.settings) self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_ENABLED'] = True + self.settings["ZYTE_SMARTPROXY_ENABLED"] = True self._assert_enabled(self.spider, self.settings) self.spider.zyte_smartproxy_enabled = False - self.settings['ZYTE_SMARTPROXY_ENABLED'] = False + self.settings["ZYTE_SMARTPROXY_ENABLED"] = False self._assert_disabled(self.spider, self.settings) def test_apikey(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_APIKEY'] = apikey = 'apikey' - proxyauth = basic_auth_header(apikey, '') - self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://apikey:@proxy.zyte.com:8011') + self.settings["ZYTE_SMARTPROXY_APIKEY"] = apikey = "apikey" + proxyauth = basic_auth_header(apikey, "") + self._assert_enabled( + self.spider, + self.settings, + proxyauth=proxyauth, + proxyurlcreds="http://apikey:@proxy.zyte.com:8011", + ) - apikey = 'notfromsettings' - proxyauth = basic_auth_header(apikey, '') + apikey = "notfromsettings" + proxyauth = basic_auth_header(apikey, "") self.spider.zyte_smartproxy_apikey = apikey - self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://notfromsettings:@proxy.zyte.com:8011') + self._assert_enabled( + self.spider, + self.settings, + proxyauth=proxyauth, + proxyurlcreds="http://notfromsettings:@proxy.zyte.com:8011", + ) def test_proxyurl(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_URL'] = 'http://localhost:8011' - self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011', proxyurlcreds='http://apikey:@localhost:8011') + self.settings["ZYTE_SMARTPROXY_URL"] = "http://localhost:8011" + self._assert_enabled( + self.spider, + self.settings, + proxyurl="http://localhost:8011", + proxyurlcreds="http://apikey:@localhost:8011", + ) def 
test_proxyurl_no_protocol(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_URL'] = 'localhost:8011' - self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011', proxyurlcreds='http://apikey:@localhost:8011') + self.settings["ZYTE_SMARTPROXY_URL"] = "localhost:8011" + self._assert_enabled( + self.spider, + self.settings, + proxyurl="http://localhost:8011", + proxyurlcreds="http://apikey:@localhost:8011", + ) def test_proxyurl_https(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_URL'] = 'https://localhost:8011' - self._assert_enabled(self.spider, self.settings, proxyurl='https://localhost:8011', proxyurlcreds='https://apikey:@localhost:8011') + self.settings["ZYTE_SMARTPROXY_URL"] = "https://localhost:8011" + self._assert_enabled( + self.spider, + self.settings, + proxyurl="https://localhost:8011", + proxyurlcreds="https://apikey:@localhost:8011", + ) def test_proxyurl_including_noconnect(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_URL'] = 'http://localhost:8011?noconnect' - self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011?noconnect', proxyurlcreds='http://apikey:@localhost:8011?noconnect') + self.settings["ZYTE_SMARTPROXY_URL"] = "http://localhost:8011?noconnect" + self._assert_enabled( + self.spider, + self.settings, + proxyurl="http://localhost:8011?noconnect", + proxyurlcreds="http://apikey:@localhost:8011?noconnect", + ) def test_maxbans(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_MAXBANS'] = maxbans = 0 + self.settings["ZYTE_SMARTPROXY_MAXBANS"] = maxbans = 0 self._assert_enabled(self.spider, self.settings, maxbans=maxbans) - self.settings['ZYTE_SMARTPROXY_MAXBANS'] = maxbans = 100 + self.settings["ZYTE_SMARTPROXY_MAXBANS"] = maxbans = 100 self._assert_enabled(self.spider, self.settings, maxbans=maxbans) # Assert setting is coerced into correct type - self.settings['ZYTE_SMARTPROXY_MAXBANS'] = '123' + self.settings["ZYTE_SMARTPROXY_MAXBANS"] = "123" self._assert_enabled(self.spider, self.settings, maxbans=123) self.spider.zyte_smartproxy_maxbans = 99 self._assert_enabled(self.spider, self.settings, maxbans=99) def test_download_timeout(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT'] = 60 + self.settings["ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT"] = 60 self._assert_enabled(self.spider, self.settings, download_timeout=60) # Assert setting is coerced into correct type - self.settings['ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT'] = '42' + self.settings["ZYTE_SMARTPROXY_DOWNLOAD_TIMEOUT"] = "42" self._assert_enabled(self.spider, self.settings, download_timeout=42) self.spider.zyte_smartproxy_download_timeout = 120 self._assert_enabled(self.spider, self.settings, download_timeout=120) def test_hooks(self): - proxyauth = basic_auth_header('foo', '') + proxyauth = basic_auth_header("foo", "") class _ECLS(self.mwcls): def is_enabled(self, spider): - wascalled.append('is_enabled') + wascalled.append("is_enabled") return enabled def get_proxyauth(self, spider): - wascalled.append('get_proxyauth') + wascalled.append("get_proxyauth") return proxyauth wascalled = [] @@ -261,19 +298,24 @@ def get_proxyauth(self, spider): enabled = False self.spider.zyte_smartproxy_enabled = True self._assert_disabled(self.spider, self.settings) - self.assertEqual(wascalled, ['is_enabled']) + self.assertEqual(wascalled, ["is_enabled"]) wascalled[:] = [] # reset enabled = True 
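# ---- editor's note -----------------------------------------------------
# test_hooks around this point relies on the two documented extension
# points: subclasses may override is_enabled and get_proxyauth. A sketch of
# such a subclass; the use_proxy / proxy_apikey spider attributes are
# illustrative, not part of the library:

from w3lib.http import basic_auth_header

from scrapy_zyte_smartproxy import ZyteSmartProxyMiddleware

class PerSpiderAuthMiddleware(ZyteSmartProxyMiddleware):
    def is_enabled(self, spider):
        # opt in per spider instead of via ZYTE_SMARTPROXY_ENABLED
        return getattr(spider, "use_proxy", False)

    def get_proxyauth(self, spider):
        # read the API key from the spider rather than the settings
        return basic_auth_header(getattr(spider, "proxy_apikey", ""), "")
# -------------------------------------------------------------------------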
self.spider.zyte_smartproxy_enabled = False - self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://foo:@proxy.zyte.com:8011') - self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth']) + self._assert_enabled( + self.spider, + self.settings, + proxyauth=proxyauth, + proxyurlcreds="http://foo:@proxy.zyte.com:8011", + ) + self.assertEqual(wascalled, ["is_enabled", "get_proxyauth"]) def test_delay_adjustment(self): delay = 0.5 - slot_key = 'example.com' - url = 'http://example.com' - ban_url = 'http://banned.example' + slot_key = "example.com" + url = "http://example.com" + ban_url = "http://banned.example" self.spider.zyte_smartproxy_enabled = True @@ -296,10 +338,10 @@ def test_delay_adjustment(self): crawler.engine.downloader.slots[slot_key] = slot # ban without retry-after - req = Request(url, meta={'download_slot': slot_key}) + req = Request(url, meta={"download_slot": slot_key}) assert mw.process_request(req, self.spider) is None assert httpproxy.process_request(req, self.spider) is None - headers = {'X-Crawlera-Error': 'banned'} + headers = {"X-Crawlera-Error": "banned"} res = self._mock_zyte_smartproxy_response( ban_url, status=self.bancode, @@ -311,10 +353,7 @@ def test_delay_adjustment(self): # ban with retry-after retry_after = 1.5 - headers = { - 'retry-after': str(retry_after), - 'X-Crawlera-Error': 'banned' - } + headers = {"retry-after": str(retry_after), "X-Crawlera-Error": "banned"} res = self._mock_zyte_smartproxy_response( ban_url, status=self.bancode, @@ -325,43 +364,43 @@ def test_delay_adjustment(self): self.assertEqual(self.spider.download_delay, delay) # DNS cache should be cleared in case of errors - dnscache['proxy.zyte.com'] = '1.1.1.1' + dnscache["proxy.zyte.com"] = "1.1.1.1" res = self._mock_zyte_smartproxy_response(url) mw.process_response(req, res, self.spider) self.assertEqual(slot.delay, delay) self.assertEqual(self.spider.download_delay, delay) - self.assertIn('proxy.zyte.com', dnscache) + self.assertIn("proxy.zyte.com", dnscache) # server failures mw.process_exception(req, ConnectionRefusedError(), self.spider) self.assertEqual(slot.delay, mw.connection_refused_delay) self.assertEqual(self.spider.download_delay, delay) - self.assertNotIn('proxy.zyte.com', dnscache) + self.assertNotIn("proxy.zyte.com", dnscache) - dnscache['proxy.zyte.com'] = '1.1.1.1' + dnscache["proxy.zyte.com"] = "1.1.1.1" res = self._mock_zyte_smartproxy_response(ban_url) mw.process_response(req, res, self.spider) self.assertEqual(slot.delay, delay) self.assertEqual(self.spider.download_delay, delay) - self.assertIn('proxy.zyte.com', dnscache) + self.assertIn("proxy.zyte.com", dnscache) mw.process_exception(req, ConnectionRefusedError(), self.spider) self.assertEqual(slot.delay, mw.connection_refused_delay) self.assertEqual(self.spider.download_delay, delay) - self.assertNotIn('proxy.zyte.com', dnscache) + self.assertNotIn("proxy.zyte.com", dnscache) - dnscache['proxy.zyte.com'] = '1.1.1.1' + dnscache["proxy.zyte.com"] = "1.1.1.1" res = self._mock_zyte_smartproxy_response(ban_url, status=self.bancode) mw.process_response(req, res, self.spider) self.assertEqual(slot.delay, delay) self.assertEqual(self.spider.download_delay, delay) - self.assertIn('proxy.zyte.com', dnscache) + self.assertIn("proxy.zyte.com", dnscache) mw.process_exception(req, ConnectionDone(), self.spider) self.assertEqual(slot.delay, mw.connection_refused_delay) self.assertEqual(self.spider.download_delay, delay) - self.assertNotIn('proxy.zyte.com', dnscache) + 
self.assertNotIn('proxy.zyte.com', dnscache)
+
self.assertNotIn("proxy.zyte.com", dnscache) def test_process_exception_outside_zyte_smartproxy(self): self.spider.zyte_smartproxy_enabled = False @@ -378,33 +417,33 @@ def test_jobid_header(self): crawler = self._mock_crawler(self.spider, self.settings) mw1 = self.mwcls.from_crawler(crawler) mw1.open_spider(self.spider) - req1 = Request('http://example.com') + req1 = Request("http://example.com") self.assertEqual(mw1.process_request(req1, self.spider), None) - self.assertEqual(req1.headers.get('X-Crawlera-Jobid'), None) - self.assertEqual(req1.headers.get('Zyte-JobId'), None) + self.assertEqual(req1.headers.get("X-Crawlera-Jobid"), None) + self.assertEqual(req1.headers.get("Zyte-JobId"), None) # test with the environment variable 'SCRAPY_JOB' - os.environ['SCRAPY_JOB'] = '2816' + os.environ["SCRAPY_JOB"] = "2816" self.spider.zyte_smartproxy_enabled = True mw2 = self.mwcls.from_crawler(crawler) mw2.open_spider(self.spider) - req2 = Request('http://example.com') + req2 = Request("http://example.com") self.assertEqual(mw2.process_request(req2, self.spider), None) - self.assertEqual(req2.headers.get('X-Crawlera-Jobid'), b'2816') - self.assertEqual(req2.headers.get('Zyte-JobId'), None) + self.assertEqual(req2.headers.get("X-Crawlera-Jobid"), b"2816") + self.assertEqual(req2.headers.get("Zyte-JobId"), None) # Zyte API mw3 = self.mwcls.from_crawler(crawler) mw3.open_spider(self.spider) req3 = Request( - 'http://example.com', + "http://example.com", meta={ "proxy": "http://apikey:@api.zyte.com:8011", }, ) self.assertEqual(mw3.process_request(req3, self.spider), None) - self.assertEqual(req3.headers.get('X-Crawlera-Jobid'), None) - self.assertEqual(req3.headers.get('Zyte-JobId'), b'2816') + self.assertEqual(req3.headers.get("X-Crawlera-Jobid"), None) + self.assertEqual(req3.headers.get("Zyte-JobId"), b"2816") def _test_stats(self, settings, prefix): self.spider.zyte_smartproxy_enabled = True @@ -416,79 +455,101 @@ def _test_stats(self, settings, prefix): mw.open_spider(spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) - req = Request('http://example.com') + req = Request("http://example.com") assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None - self.assertEqual(crawler.stats.get_value('{}/request'.format(prefix)), 1) - self.assertEqual(crawler.stats.get_value('{}/request/method/GET'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value("{}/request".format(prefix)), 1) + self.assertEqual( + crawler.stats.get_value("{}/request/method/GET".format(prefix)), 1 + ) res = self._mock_zyte_smartproxy_response(req.url) assert mw.process_response(req, res, spider) is res - self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 1) - self.assertEqual(crawler.stats.get_value('{}/response/status/200'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 1) + self.assertEqual( + crawler.stats.get_value("{}/response/status/200".format(prefix)), 1 + ) - req = Request('http://example.com/other', method='POST') + req = Request("http://example.com/other", method="POST") assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None - self.assertEqual(crawler.stats.get_value('{}/request'.format(prefix)), 2) - self.assertEqual(crawler.stats.get_value('{}/request/method/POST'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value("{}/request".format(prefix)), 2) + self.assertEqual( + crawler.stats.get_value("{}/request/method/POST".format(prefix)), 1 
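# ---- editor's note -----------------------------------------------------
# test_jobid_header above pins down the header-name switch: the same
# SCRAPY_JOB value is sent as X-Crawlera-JobId to Smart Proxy Manager but
# as Zyte-JobId when the proxy targets api.zyte.com. A sketch of that
# selection:

import os

def job_id_header(targets_zyte_api):
    name = "Zyte-JobId" if targets_zyte_api else "X-Crawlera-JobId"
    return name, os.environ.get("SCRAPY_JOB")

os.environ["SCRAPY_JOB"] = "2816"
assert job_id_header(False) == ("X-Crawlera-JobId", "2816")
assert job_id_header(True) == ("Zyte-JobId", "2816")
# -------------------------------------------------------------------------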
+ ) res = self._mock_zyte_smartproxy_response( - req.url, - status=mw.ban_code, - headers={'Zyte-Error': 'somethingbad'} + req.url, status=mw.ban_code, headers={"Zyte-Error": "somethingbad"} ) assert mw.process_response(req, res, spider) is res - self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 2) - self.assertEqual(crawler.stats.get_value('{}/response/status/{}'.format(prefix, mw.ban_code)), 1) - self.assertEqual(crawler.stats.get_value('{}/response/error'.format(prefix)), 1) - self.assertEqual(crawler.stats.get_value('{}/response/error/somethingbad'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 2) + self.assertEqual( + crawler.stats.get_value( + "{}/response/status/{}".format(prefix, mw.ban_code) + ), + 1, + ) + self.assertEqual(crawler.stats.get_value("{}/response/error".format(prefix)), 1) + self.assertEqual( + crawler.stats.get_value("{}/response/error/somethingbad".format(prefix)), 1 + ) self.assertEqual(res.headers["X-Crawlera-Error"], b"somethingbad") self.assertEqual(res.headers["Zyte-Error"], b"somethingbad") res = self._mock_zyte_smartproxy_response( req.url, status=mw.ban_code, - headers={'X-Crawlera-Error': 'banned', "Retry-After": "1"} + headers={"X-Crawlera-Error": "banned", "Retry-After": "1"}, ) assert mw.process_response(req, res, spider) is res - self.assertEqual(crawler.stats.get_value('{}/response'.format(prefix)), 3) - self.assertEqual(crawler.stats.get_value('{}/response/status/{}'.format(prefix, mw.ban_code)), 2) - self.assertEqual(crawler.stats.get_value('{}/response/banned'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 3) + self.assertEqual( + crawler.stats.get_value( + "{}/response/status/{}".format(prefix, mw.ban_code) + ), + 2, + ) + self.assertEqual( + crawler.stats.get_value("{}/response/banned".format(prefix)), 1 + ) self.assertEqual(res.headers["X-Crawlera-Error"], b"banned") self.assertEqual(res.headers["Zyte-Error"], b"banned") res = self._mock_zyte_smartproxy_response( req.url, status=mw.ban_code, - headers={'X-Crawlera-Error': 'banned', "Retry-After": "1"} + headers={"X-Crawlera-Error": "banned", "Retry-After": "1"}, ) slot_key = "example.com" crawler.engine.downloader.slots[slot_key] = MockedSlot() req.meta["download_slot"] = "example.com" assert mw.process_response(req, res, spider) is res del req.meta["download_slot"] - self.assertEqual(crawler.stats.get_value('{}/delay/banned'.format(prefix)), 1) - self.assertEqual(crawler.stats.get_value('{}/delay/banned/total'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value("{}/delay/banned".format(prefix)), 1) + self.assertEqual( + crawler.stats.get_value("{}/delay/banned/total".format(prefix)), 1 + ) res = self._mock_zyte_smartproxy_response( req.url, status=407, - headers={'X-Crawlera-Error': 'bad_proxy_auth'}, + headers={"X-Crawlera-Error": "bad_proxy_auth"}, ) assert isinstance(mw.process_response(req, res, spider), Request) - self.assertEqual(crawler.stats.get_value('{}/retries/auth'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value("{}/retries/auth".format(prefix)), 1) res = self._mock_zyte_smartproxy_response( req.url, status=407, - headers={'X-Crawlera-Error': 'bad_proxy_auth'}, + headers={"X-Crawlera-Error": "bad_proxy_auth"}, ) req.meta["zyte_smartproxy_auth_retry_times"] = 11 assert mw.process_response(req, res, spider) is res del req.meta["zyte_smartproxy_auth_retry_times"] - self.assertEqual(crawler.stats.get_value('{}/retries/auth'.format(prefix)), 1) - 
self.assertEqual(crawler.stats.get_value('{}/retries/auth/max_reached'.format(prefix)), 1)
+
self.assertEqual(crawler.stats.get_value('{}/retries/auth/max_reached'.format(prefix)), 1) + self.assertEqual(crawler.stats.get_value("{}/retries/auth".format(prefix)), 1) + self.assertEqual( + crawler.stats.get_value("{}/retries/auth/max_reached".format(prefix)), 1 + ) res = self._mock_zyte_smartproxy_response( req.url, @@ -497,7 +558,12 @@ def _test_stats(self, settings, prefix): req.meta["dont_proxy"] = True assert isinstance(mw.process_response(req, res, spider), Request) del req.meta["dont_proxy"] - self.assertEqual(crawler.stats.get_value('{}/retries/should_have_been_enabled'.format(prefix)), 1) + self.assertEqual( + crawler.stats.get_value( + "{}/retries/should_have_been_enabled".format(prefix) + ), + 1, + ) def test_stats_spm(self): self._test_stats(self.settings, "zyte_smartproxy") @@ -514,16 +580,16 @@ def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs): mw.open_spider(spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) headers = { - 'X-Crawlera-Debug': True, - 'X-Crawlera-Foo': "foo", - 'X-Crawlera-Profile': 'desktop', - 'User-Agent': 'Scrapy', - '': None, - 'Zyte-Bar': "bar", - 'Zyte-BrowserHtml': True, - 'Zyte-Geolocation': 'foo', + "X-Crawlera-Debug": True, + "X-Crawlera-Foo": "foo", + "X-Crawlera-Profile": "desktop", + "User-Agent": "Scrapy", + "": None, + "Zyte-Bar": "bar", + "Zyte-BrowserHtml": True, + "Zyte-Geolocation": "foo", } - req = Request('http://example.com', headers=headers, **kwargs) + req = Request("http://example.com", headers=headers, **kwargs) mw.process_request(req, spider) httpproxy.process_request(req, spider) return req @@ -531,92 +597,95 @@ def _make_fake_request(self, spider, zyte_smartproxy_enabled, **kwargs): def test_clean_headers_when_disabled(self): req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=False) - self.assertNotIn(b'X-Crawlera-Debug', req.headers) - self.assertNotIn(b'X-Crawlera-Foo', req.headers) - self.assertNotIn(b'X-Crawlera-Profile', req.headers) - self.assertNotIn(b'Zyte-Bar', req.headers) - self.assertNotIn(b'Zyte-BrowserHtml', req.headers) - self.assertNotIn(b'Zyte-Geolocation', req.headers) - self.assertIn(b'User-Agent', req.headers) + self.assertNotIn(b"X-Crawlera-Debug", req.headers) + self.assertNotIn(b"X-Crawlera-Foo", req.headers) + self.assertNotIn(b"X-Crawlera-Profile", req.headers) + self.assertNotIn(b"Zyte-Bar", req.headers) + self.assertNotIn(b"Zyte-BrowserHtml", req.headers) + self.assertNotIn(b"Zyte-Geolocation", req.headers) + self.assertIn(b"User-Agent", req.headers) def test_clean_headers_when_enabled_spm(self): req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=True) - self.assertEqual(req.headers[b'X-Crawlera-Debug'], b'True') - self.assertEqual(req.headers[b'X-Crawlera-Foo'], b'foo') - self.assertEqual(req.headers[b'X-Crawlera-Profile'], b'desktop') - self.assertNotIn(b'Zyte-Bar', req.headers) - self.assertNotIn(b'Zyte-BrowserHtml', req.headers) - self.assertNotIn(b'Zyte-Geolocation', req.headers) - self.assertEqual(req.headers[b'X-Crawlera-Region'], b'foo') - self.assertIn(b'User-Agent', req.headers) + self.assertEqual(req.headers[b"X-Crawlera-Debug"], b"True") + self.assertEqual(req.headers[b"X-Crawlera-Foo"], b"foo") + self.assertEqual(req.headers[b"X-Crawlera-Profile"], b"desktop") + self.assertNotIn(b"Zyte-Bar", req.headers) + self.assertNotIn(b"Zyte-BrowserHtml", req.headers) + self.assertNotIn(b"Zyte-Geolocation", req.headers) + self.assertEqual(req.headers[b"X-Crawlera-Region"], b"foo") + self.assertIn(b"User-Agent", req.headers) def 
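# ---- editor's note -----------------------------------------------------
# The SPM-side test above and the Zyte API-side test just below pin down
# header translation in both directions. A sketch restricted to the two
# pairs these tests exercise; the tables in middleware.py may cover more:

SPM_TO_ZYTE_API = {b"X-Crawlera-Profile": b"Zyte-Device"}
ZYTE_API_TO_SPM = {b"Zyte-Geolocation": b"X-Crawlera-Region"}

def translate_headers(headers, targets_zyte_api):
    table = SPM_TO_ZYTE_API if targets_zyte_api else ZYTE_API_TO_SPM
    for old, new in table.items():
        if old in headers:
            headers[new] = headers.pop(old)
    return headers

assert translate_headers({b"X-Crawlera-Profile": b"desktop"}, True) == {
    b"Zyte-Device": b"desktop"
}
assert translate_headers({b"Zyte-Geolocation": b"foo"}, False) == {
    b"X-Crawlera-Region": b"foo"
}
# -------------------------------------------------------------------------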
test_clean_headers_when_enabled_zyte_api(self): meta = {"proxy": "http://apikey:@api.zyte.com:8011"} - req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=True, meta=meta) - self.assertNotIn(b'X-Crawlera-Debug', req.headers) - self.assertNotIn(b'X-Crawlera-Foo', req.headers) - self.assertNotIn(b'X-Crawlera-Profile', req.headers) - self.assertEqual(req.headers[b'Zyte-Bar'], b'bar') - self.assertEqual(req.headers[b'Zyte-BrowserHtml'], b'True') - self.assertEqual(req.headers[b'Zyte-Device'], b'desktop') - self.assertEqual(req.headers[b'Zyte-Geolocation'], b'foo') - self.assertIn(b'User-Agent', req.headers) + req = self._make_fake_request( + self.spider, zyte_smartproxy_enabled=True, meta=meta + ) + self.assertNotIn(b"X-Crawlera-Debug", req.headers) + self.assertNotIn(b"X-Crawlera-Foo", req.headers) + self.assertNotIn(b"X-Crawlera-Profile", req.headers) + self.assertEqual(req.headers[b"Zyte-Bar"], b"bar") + self.assertEqual(req.headers[b"Zyte-BrowserHtml"], b"True") + self.assertEqual(req.headers[b"Zyte-Device"], b"desktop") + self.assertEqual(req.headers[b"Zyte-Geolocation"], b"foo") + self.assertIn(b"User-Agent", req.headers) def test_zyte_smartproxy_default_headers(self): spider = self.spider self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_DEFAULT_HEADERS'] = { - 'X-Crawlera-Profile': 'desktop', + self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = { + "X-Crawlera-Profile": "desktop", } crawler = self._mock_crawler(spider, self.settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) - req = Request('http://example.com/other') + req = Request("http://example.com/other") assert mw.process_request(req, spider) is None - self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop') - self.assertNotIn('Zyte-Device', req.headers) + self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop") + self.assertNotIn("Zyte-Device", req.headers) # Header translation req = Request( - 'http://example.com/other', + "http://example.com/other", meta={"proxy": "http://apikey:@api.zyte.com:8011"}, ) assert mw.process_request(req, spider) is None - self.assertNotIn('X-Crawlera-Profile', req.headers) - self.assertEqual(req.headers['Zyte-Device'], b'desktop') + self.assertNotIn("X-Crawlera-Profile", req.headers) + self.assertEqual(req.headers["Zyte-Device"], b"desktop") # test ignore None headers - self.settings['ZYTE_SMARTPROXY_DEFAULT_HEADERS'] = { - 'X-Crawlera-Profile': None, - 'X-Crawlera-Cookies': 'disable', + self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = { + "X-Crawlera-Profile": None, + "X-Crawlera-Cookies": "disable", } crawler = self._mock_crawler(spider, self.settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) - req = Request('http://example.com/other') + req = Request("http://example.com/other") assert mw.process_request(req, spider) is None - self.assertEqual(req.headers['X-Crawlera-Cookies'], b'disable') - self.assertNotIn('X-Crawlera-Profile', req.headers) - - @patch('scrapy_zyte_smartproxy.middleware.warnings') - @patch('scrapy_zyte_smartproxy.middleware.logger') - def test_zyte_smartproxy_default_headers_conflicting_headers(self, mock_logger, mock_warnings): + self.assertEqual(req.headers["X-Crawlera-Cookies"], b"disable") + self.assertNotIn("X-Crawlera-Profile", req.headers) + + @patch("scrapy_zyte_smartproxy.middleware.warnings") + @patch("scrapy_zyte_smartproxy.middleware.logger") + def test_zyte_smartproxy_default_headers_conflicting_headers( + self, mock_logger, mock_warnings + ): spider = self.spider 
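# ---- editor's note -----------------------------------------------------
# test_zyte_smartproxy_default_headers above shows the two rules for
# ZYTE_SMARTPROXY_DEFAULT_HEADERS: values are applied with setdefault
# semantics, and a None value means the header is never set. A dict sketch:

def apply_default_headers(headers, defaults):
    for name, value in defaults.items():
        if value is None:  # explicit opt-out for this default header
            continue
        headers.setdefault(name, value)
    return headers

h = apply_default_headers(
    {}, {"X-Crawlera-Profile": None, "X-Crawlera-Cookies": "disable"}
)
assert h == {"X-Crawlera-Cookies": "disable"}
# -------------------------------------------------------------------------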
self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_DEFAULT_HEADERS'] = { - 'X-Crawlera-Profile': 'desktop', + self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = { + "X-Crawlera-Profile": "desktop", } crawler = self._mock_crawler(spider, self.settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) - req = Request('http://example.com/other', - headers={'X-Crawlera-UA': 'desktop'}) + req = Request("http://example.com/other", headers={"X-Crawlera-UA": "desktop"}) assert mw.process_request(req, spider) is None - self.assertEqual(req.headers['X-Crawlera-UA'], b'desktop') - self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop') + self.assertEqual(req.headers["X-Crawlera-UA"], b"desktop") + self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop") some_requests_warning = ( "The headers ('X-Crawlera-Profile', 'X-Crawlera-UA') are " "conflicting on some of your requests. Please check " @@ -633,20 +702,17 @@ def test_zyte_smartproxy_default_headers_conflicting_headers(self, mock_logger, "for more information" ) mock_logger.debug.assert_called_with( - other_request_warning, - extra={'spider': spider} + other_request_warning, extra={"spider": spider} ) # test it ignores case - req = Request('http://example.com/other', - headers={'x-crawlera-ua': 'desktop'}) + req = Request("http://example.com/other", headers={"x-crawlera-ua": "desktop"}) assert mw.process_request(req, spider) is None - self.assertEqual(req.headers['X-Crawlera-UA'], b'desktop') - self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop') + self.assertEqual(req.headers["X-Crawlera-UA"], b"desktop") + self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop") mock_warnings.warn.assert_called_with(some_requests_warning) mock_logger.debug.assert_called_with( - other_request_warning, - extra={'spider': spider} + other_request_warning, extra={"spider": spider} ) def test_dont_proxy_false_does_nothing(self): @@ -655,10 +721,10 @@ def test_dont_proxy_false_does_nothing(self): crawler = self._mock_crawler(spider, self.settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(spider) - req = Request('http://example.com/other') - req.meta['dont_proxy'] = False + req = Request("http://example.com/other") + req.meta["dont_proxy"] = False assert mw.process_request(req, spider) is None - self.assertIsNotNone(req.meta.get('proxy')) + self.assertIsNotNone(req.meta.get("proxy")) def test_is_banned(self): self.spider.zyte_smartproxy_enabled = True @@ -670,37 +736,43 @@ def test_is_banned(self): res = Response(req.url, status=200) res = mw.process_response(req, res, self.spider) self.assertFalse(mw._is_banned(res)) - res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'noslaves'}) + res = Response(req.url, status=503, headers={"X-Crawlera-Error": "noslaves"}) res = mw.process_response(req, res, self.spider) self.assertFalse(mw._is_banned(res)) - res = Response(req.url, status=503, headers={'Zyte-Error': '/limits/over-global-limit'}) + res = Response( + req.url, status=503, headers={"Zyte-Error": "/limits/over-global-limit"} + ) res = mw.process_response(req, res, self.spider) self.assertFalse(mw._is_banned(res)) - res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'banned'}) + res = Response(req.url, status=503, headers={"X-Crawlera-Error": "banned"}) res = mw.process_response(req, res, self.spider) self.assertTrue(mw._is_banned(res)) - res = Response(req.url, status=520, headers={'Zyte-Error': '/download/temporary-error'}) + res = Response( + req.url, 
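# ---- editor's note -----------------------------------------------------
# test_is_banned around this point exercises both ban shapes the middleware
# accepts: the SPM form (ban_code status plus X-Crawlera-Error: banned) and
# the Zyte API form (status 520 or 521 with any Zyte-Error value). A
# predicate sketch with 503 standing in for the configurable ban_code:

def is_banned(status, headers):
    return (
        status == 503 and headers.get("X-Crawlera-Error") == b"banned"
    ) or (status in {520, 521} and bool(headers.get("Zyte-Error")))

assert not is_banned(503, {"X-Crawlera-Error": b"noslaves"})
assert is_banned(503, {"X-Crawlera-Error": b"banned"})
assert is_banned(520, {"Zyte-Error": b"/download/temporary-error"})
# -------------------------------------------------------------------------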
status=520, headers={"Zyte-Error": "/download/temporary-error"} + ) res = mw.process_response(req, res, self.spider) self.assertTrue(mw._is_banned(res)) - res = Response(req.url, status=521, headers={'Zyte-Error': '/download/internal-error'}) + res = Response( + req.url, status=521, headers={"Zyte-Error": "/download/internal-error"} + ) res = mw.process_response(req, res, self.spider) self.assertTrue(mw._is_banned(res)) - @patch('random.uniform') + @patch("random.uniform") def test_noslaves_delays(self, random_uniform_patch): # mock random.uniform to just return the max delay random_uniform_patch.side_effect = lambda x, y: y - slot_key = 'example.com' - url = 'http://example.com' - ban_url = 'http://banned.example' + slot_key = "example.com" + url = "http://example.com" + ban_url = "http://banned.example" max_delay = 70 backoff_step = 15 default_delay = 0 - self.settings['ZYTE_SMARTPROXY_BACKOFF_STEP'] = backoff_step - self.settings['ZYTE_SMARTPROXY_BACKOFF_MAX'] = max_delay + self.settings["ZYTE_SMARTPROXY_BACKOFF_STEP"] = backoff_step + self.settings["ZYTE_SMARTPROXY_BACKOFF_MAX"] = max_delay self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) @@ -711,7 +783,7 @@ def test_noslaves_delays(self, random_uniform_patch): slot = MockedSlot() crawler.engine.downloader.slots[slot_key] = slot - noslaves_req = Request(url, meta={'download_slot': slot_key}) + noslaves_req = Request(url, meta={"download_slot": slot_key}) assert mw.process_request(noslaves_req, self.spider) is None assert httpproxy.process_request(noslaves_req, self.spider) is None @@ -719,7 +791,7 @@ def test_noslaves_delays(self, random_uniform_patch): noslaves_response = self._mock_zyte_smartproxy_response( ban_url, status=503, - headers={'X-Crawlera-Error': 'noslaves'}, + headers={"X-Crawlera-Error": "noslaves"}, ) mw.process_response(noslaves_req, noslaves_response, self.spider) self.assertEqual(slot.delay, backoff_step) @@ -727,32 +799,32 @@ def test_noslaves_delays(self, random_uniform_patch): over_use_limit_response = self._mock_zyte_smartproxy_response( ban_url, status=429, - headers={'Zyte-Error': '/limits/over-user-limit'}, + headers={"Zyte-Error": "/limits/over-user-limit"}, ) mw.process_response(noslaves_req, over_use_limit_response, self.spider) - self.assertEqual(slot.delay, backoff_step * 2 ** 1) + self.assertEqual(slot.delay, backoff_step * 2**1) over_domain_limit_response = self._mock_zyte_smartproxy_response( ban_url, status=429, - headers={'Zyte-Error': '/limits/over-domain-limit'}, + headers={"Zyte-Error": "/limits/over-domain-limit"}, ) mw.process_response(noslaves_req, over_domain_limit_response, self.spider) - self.assertEqual(slot.delay, backoff_step * 2 ** 2) + self.assertEqual(slot.delay, backoff_step * 2**2) over_global_limit_response = self._mock_zyte_smartproxy_response( ban_url, status=503, - headers={'Zyte-Error': '/limits/over-global-limit'}, + headers={"Zyte-Error": "/limits/over-global-limit"}, ) mw.process_response(noslaves_req, over_global_limit_response, self.spider) self.assertEqual(slot.delay, max_delay) # other responses reset delay - ban_req = Request(url, meta={'download_slot': slot_key}) + ban_req = Request(url, meta={"download_slot": slot_key}) assert mw.process_request(ban_req, self.spider) is None assert httpproxy.process_request(ban_req, self.spider) is None - ban_headers = {'X-Crawlera-Error': 'banned'} + ban_headers = {"X-Crawlera-Error": "banned"} ban_res = self._mock_zyte_smartproxy_response( ban_url, status=self.bancode, @@ -764,7 
+836,7 @@ def test_noslaves_delays(self, random_uniform_patch): mw.process_response(noslaves_req, noslaves_response, self.spider) self.assertEqual(slot.delay, backoff_step) - good_req = Request(url, meta={'download_slot': slot_key}) + good_req = Request(url, meta={"download_slot": slot_key}) assert mw.process_request(good_req, self.spider) is None assert httpproxy.process_request(good_req, self.spider) is None good_res = self._mock_zyte_smartproxy_response( @@ -774,20 +846,19 @@ def test_noslaves_delays(self, random_uniform_patch): mw.process_response(good_req, good_res, self.spider) self.assertEqual(slot.delay, default_delay) - @patch('random.uniform') + @patch("random.uniform") def test_auth_error_retries(self, random_uniform_patch): # mock random.uniform to just return the max delay random_uniform_patch.side_effect = lambda x, y: y - slot_key = 'example.com' - url = 'http://example.com' - ban_url = 'http://auth.error' + slot_key = "example.com" + url = "http://example.com" + ban_url = "http://auth.error" max_delay = 70 backoff_step = 15 - default_delay = 0 - self.settings['ZYTE_SMARTPROXY_BACKOFF_STEP'] = backoff_step - self.settings['ZYTE_SMARTPROXY_BACKOFF_MAX'] = max_delay + self.settings["ZYTE_SMARTPROXY_BACKOFF_STEP"] = backoff_step + self.settings["ZYTE_SMARTPROXY_BACKOFF_MAX"] = max_delay self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) @@ -799,14 +870,12 @@ def test_auth_error_retries(self, random_uniform_patch): slot = MockedSlot() crawler.engine.downloader.slots[slot_key] = slot - auth_error_req = Request(url, meta={'download_slot': slot_key}) + auth_error_req = Request(url, meta={"download_slot": slot_key}) assert mw.process_request(auth_error_req, self.spider) is None assert httpproxy.process_request(auth_error_req, self.spider) is None - auth_error_headers = {'X-Crawlera-Error': 'bad_proxy_auth'} + auth_error_headers = {"X-Crawlera-Error": "bad_proxy_auth"} auth_error_response = self._mock_zyte_smartproxy_response( - ban_url, - status=self.auth_error_code, - headers=auth_error_headers + ban_url, status=self.auth_error_code, headers=auth_error_headers ) # delays grow exponentially, retry times increase accordingly @@ -817,13 +886,13 @@ def test_auth_error_retries(self, random_uniform_patch): auth_error_req.meta["zyte_smartproxy_auth_retry_times"] = retry_times req = mw.process_response(auth_error_req, auth_error_response, self.spider) - self.assertEqual(slot.delay, backoff_step * 2 ** 1) + self.assertEqual(slot.delay, backoff_step * 2**1) retry_times = req.meta["zyte_smartproxy_auth_retry_times"] self.assertEqual(retry_times, 2) auth_error_req.meta["zyte_smartproxy_auth_retry_times"] = retry_times req = mw.process_response(auth_error_req, auth_error_response, self.spider) - self.assertEqual(slot.delay, backoff_step * 2 ** 2) + self.assertEqual(slot.delay, backoff_step * 2**2) retry_times = req.meta["zyte_smartproxy_auth_retry_times"] self.assertEqual(retry_times, 3) @@ -844,10 +913,12 @@ def test_auth_error_retries(self, random_uniform_patch): ban_url, status=self.auth_error_code, ) - res = mw.process_response(auth_error_req, non_zyte_smartproxy_407_response, self.spider) + res = mw.process_response( + auth_error_req, non_zyte_smartproxy_407_response, self.spider + ) self.assertIsInstance(res, Response) - @patch('scrapy_zyte_smartproxy.middleware.logger') + @patch("scrapy_zyte_smartproxy.middleware.logger") def test_open_spider_logging(self, mock_logger): spider = self.spider self.spider.zyte_smartproxy_enabled = True @@ 
-856,10 +927,9 @@ def test_open_spider_logging(self, mock_logger): mw.open_spider(spider) expected_calls = [ call( - "Using Zyte proxy service %s with an API key ending in %s" % ( - self.mwcls.url, 'apikey' - ), - extra={'spider': spider}, + "Using Zyte proxy service %s with an API key ending in %s" + % (self.mwcls.url, "apikey"), + extra={"spider": spider}, ), call( "ZyteSmartProxyMiddleware: disabling download delays in " @@ -867,7 +937,7 @@ def test_open_spider_logging(self, mock_logger): "To avoid this behaviour you can use the " "ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind " "that this may slow down the crawl significantly", - extra={'spider': spider}, + extra={"spider": spider}, ), ] assert mock_logger.info.call_args_list == expected_calls @@ -876,7 +946,7 @@ def test_process_response_enables_zyte_smartproxy(self): url = "https://scrapy.org" self.spider.zyte_smartproxy_enabled = False - self.settings['ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES'] = [403] + self.settings["ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES"] = [403] crawler = self._mock_crawler(self.spider, self.settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(self.spider) @@ -898,9 +968,12 @@ def test_process_response_enables_zyte_smartproxy(self): self.assertIsInstance(out, Request) self.assertEqual(mw.enabled, False) self.assertEqual(mw.enabled_for_domain["scrapy.org"], True) - self.assertEqual(mw.crawler.stats.get_stats(), { - 'zyte_smartproxy/retries/should_have_been_enabled': 1, - }) + self.assertEqual( + mw.crawler.stats.get_stats(), + { + "zyte_smartproxy/retries/should_have_been_enabled": 1, + }, + ) # Another regular response with bad code should be done on Zyte Smart # Proxy Manager and not be retried @@ -926,7 +999,7 @@ def test_process_response_from_file_scheme(self): url = "file:///tmp/foobar.txt" self.spider.zyte_smartproxy_enabled = False - self.settings['ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES'] = [403] + self.settings["ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES"] = [403] crawler = self._mock_crawler(self.spider, self.settings) mw = self.mwcls.from_crawler(crawler) mw.enabled_for_domain = {} @@ -943,7 +1016,7 @@ def test_process_response_from_file_scheme(self): self.assertEqual(mw.crawler.stats.get_stats(), {}) self.assertEqual(out.status, 200) - @patch('scrapy_zyte_smartproxy.middleware.logger') + @patch("scrapy_zyte_smartproxy.middleware.logger") def test_apikey_warning_zyte_smartproxy_disabled(self, mock_logger): self.spider.zyte_smartproxy_enabled = False settings = {} @@ -953,7 +1026,7 @@ def test_apikey_warning_zyte_smartproxy_disabled(self, mock_logger): self.assertFalse(mw.enabled) mock_logger.warning.assert_not_called() - @patch('scrapy_zyte_smartproxy.middleware.logger') + @patch("scrapy_zyte_smartproxy.middleware.logger") def test_no_apikey_warning_zyte_smartproxy_enabled(self, mock_logger): self.spider.zyte_smartproxy_enabled = True settings = {} @@ -963,28 +1036,28 @@ def test_no_apikey_warning_zyte_smartproxy_enabled(self, mock_logger): self.assertTrue(mw.enabled) mock_logger.warning.assert_called_with( "Zyte proxy services cannot be used without an API key", - extra={'spider': self.spider} + extra={"spider": self.spider}, ) - @patch('scrapy_zyte_smartproxy.middleware.logger') + @patch("scrapy_zyte_smartproxy.middleware.logger") def test_no_apikey_warning_force_enable(self, mock_logger): self.spider.zyte_smartproxy_enabled = False - settings = {'ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES': [403]} + settings = {"ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES": [403]} 
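# ---- editor's note -----------------------------------------------------
# The force-enable flow tested above: when the middleware is off and an
# unproxied response comes back with a status listed in
# ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES, the request's domain is
# enabled for proxying and the request is retried. A sketch of that
# decision (merging the check and the domain flip, which the middleware
# splits across two helpers):

def maybe_force_enable(status, force_enable_codes, domain, enabled_for_domain):
    if status in force_enable_codes and not enabled_for_domain.get(domain):
        enabled_for_domain[domain] = True  # proxy future requests here
        return True                        # and retry this one
    return False

state = {}
assert maybe_force_enable(403, [403], "scrapy.org", state)
assert state == {"scrapy.org": True}
assert not maybe_force_enable(200, [403], "scrapy.org", state)
# -------------------------------------------------------------------------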
crawler = self._mock_crawler(self.spider, settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(self.spider) self.assertFalse(mw.enabled) mock_logger.warning.assert_called_with( "Zyte proxy services cannot be used without an API key", - extra={'spider': self.spider} + extra={"spider": self.spider}, ) - @patch('scrapy_zyte_smartproxy.middleware.logger') + @patch("scrapy_zyte_smartproxy.middleware.logger") def test_apikey_warning_force_enable(self, mock_logger): self.spider.zyte_smartproxy_enabled = False settings = { - 'ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES': [403], - 'ZYTE_SMARTPROXY_APIKEY': 'apikey' + "ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES": [403], + "ZYTE_SMARTPROXY_APIKEY": "apikey", } crawler = self._mock_crawler(self.spider, settings) mw = self.mwcls.from_crawler(crawler) @@ -992,24 +1065,20 @@ def test_apikey_warning_force_enable(self, mock_logger): self.assertFalse(mw.enabled) mock_logger.warning.assert_not_called() - def test_is_enabled_warnings(self): self._assert_disabled(self.spider, self.settings) - self.settings['HUBPROXY_ENABLED'] = True + self.settings["HUBPROXY_ENABLED"] = True with pytest.warns(ScrapyDeprecationWarning) as record: self._assert_enabled(self.spider, self.settings) assert len(record) == 1 - assert 'HUBPROXY_ENABLED setting is deprecated' in \ - str(record[0].message) + assert "HUBPROXY_ENABLED setting is deprecated" in str(record[0].message) - del self.settings['HUBPROXY_ENABLED'] + del self.settings["HUBPROXY_ENABLED"] self.spider.use_hubproxy = False with pytest.warns(ScrapyDeprecationWarning) as record: self._assert_disabled(self.spider, self.settings) assert len(record) == 1 - assert 'use_hubproxy attribute is deprecated' in \ - str(record[0].message) - + assert "use_hubproxy attribute is deprecated" in str(record[0].message) def test_settings_warnings(self): self.spider.hubproxy_maxbans = 10 @@ -1018,23 +1087,22 @@ def test_settings_warnings(self): with pytest.warns(ScrapyDeprecationWarning) as record: mw.open_spider(self.spider) assert len(record) == 1 - assert 'hubproxy_maxbans attribute is deprecated' in \ - str(record[0].message) + assert "hubproxy_maxbans attribute is deprecated" in str(record[0].message) del self.spider.hubproxy_maxbans - self.settings['HUBPROXY_BACKOFF_MAX'] = 10 + self.settings["HUBPROXY_BACKOFF_MAX"] = 10 crawler = self._mock_crawler(self.spider, self.settings) mw = self.mwcls.from_crawler(crawler) with pytest.warns(ScrapyDeprecationWarning) as record: mw.open_spider(self.spider) assert len(record) == 1 - assert 'HUBPROXY_BACKOFF_MAX setting is deprecated' in \ - str(record[0].message) - + assert "HUBPROXY_BACKOFF_MAX setting is deprecated" in str( + record[0].message + ) def test_no_slot(self): - url = 'http://example.com' - ban_url = 'http://banned.example' + url = "http://example.com" + ban_url = "http://banned.example" self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) @@ -1042,11 +1110,10 @@ def test_no_slot(self): mw.open_spider(self.spider) # there are no slot named 'example.com' - noslaves_req = Request(url, - meta={'download_slot': 'example.com'}) + noslaves_req = Request(url, meta={"download_slot": "example.com"}) assert mw.process_request(noslaves_req, self.spider) is None - headers = {'X-Crawlera-Error': 'noslaves'} + headers = {"X-Crawlera-Error": "noslaves"} noslaves_res = self._mock_zyte_smartproxy_response( ban_url, status=self.bancode, @@ -1056,45 +1123,42 @@ def test_no_slot(self): response = mw.process_response(noslaves_req, noslaves_res, 
self.spider) assert response.status == 503 - def test_settings_dict(self): self.spider.zyte_smartproxy_enabled = True - self.settings['ZYTE_SMARTPROXY_DEFAULT_HEADERS'] = { - 'X-Crawlera-Profile': 'desktop', + self.settings["ZYTE_SMARTPROXY_DEFAULT_HEADERS"] = { + "X-Crawlera-Profile": "desktop", } crawler = self._mock_crawler(self.spider, self.settings) mw = self.mwcls.from_crawler(crawler) # we don't have a dict settings yet, have to mess with protected # property - mw._settings.append( - ('default_headers', dict) - ) + mw._settings.append(("default_headers", dict)) mw.open_spider(self.spider) - req = Request('http://example.com/other') + req = Request("http://example.com/other") mw.process_request(req, self.spider) assert mw.process_request(req, self.spider) is None - self.assertEqual(req.headers['X-Crawlera-Profile'], b'desktop') + self.assertEqual(req.headers["X-Crawlera-Profile"], b"desktop") def test_client_header(self): self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) mw = self.mwcls.from_crawler(crawler) mw.open_spider(self.spider) - req1 = Request('http://example.com') + req1 = Request("http://example.com") self.assertEqual(mw.process_request(req1, self.spider), None) - client = 'scrapy-zyte-smartproxy/{}'.format(__version__).encode() - self.assertEqual(req1.headers.get('X-Crawlera-Client'), client) - self.assertEqual(req1.headers.get('Zyte-Client'), None) + client = "scrapy-zyte-smartproxy/{}".format(__version__).encode() + self.assertEqual(req1.headers.get("X-Crawlera-Client"), client) + self.assertEqual(req1.headers.get("Zyte-Client"), None) req2 = Request( - 'http://example.com', + "http://example.com", meta={ "proxy": "http://apikey:@api.zyte.com:8011", }, ) self.assertEqual(mw.process_request(req2, self.spider), None) - self.assertEqual(req2.headers.get('X-Crawlera-Client'), None) - self.assertEqual(req2.headers.get('Zyte-Client'), client) + self.assertEqual(req2.headers.get("X-Crawlera-Client"), None) + self.assertEqual(req2.headers.get("Zyte-Client"), client) def test_scrapy_httpproxy_integration(self): self.spider.zyte_smartproxy_enabled = True @@ -1102,26 +1166,26 @@ def test_scrapy_httpproxy_integration(self): smartproxy = self.mwcls.from_crawler(crawler) smartproxy.open_spider(self.spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) - request = Request('https://example.com') - auth_header = basic_auth_header('apikey', '') + request = Request("https://example.com") + auth_header = basic_auth_header("apikey", "") # 1st pass self.assertEqual(smartproxy.process_request(request, self.spider), None) self.assertEqual(httpproxy.process_request(request, self.spider), None) - self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011') - self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header) + self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011") + self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) # 2nd pass (e.g. 
retry or redirect) self.assertEqual(smartproxy.process_request(request, self.spider), None) self.assertEqual(httpproxy.process_request(request, self.spider), None) - self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011') - self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header) + self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011") + self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) def test_subclass_non_basic_header(self): class Subclass(self.mwcls): def get_proxyauth(self, spider): - return b'Non-Basic foo' + return b"Non-Basic foo" self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) @@ -1133,7 +1197,7 @@ def test_subclass_basic_header_non_base64(self): class Subclass(self.mwcls): def get_proxyauth(self, spider): - return b'Basic foo' + return b"Basic foo" self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) @@ -1145,7 +1209,7 @@ def test_subclass_basic_header_nonurlsafe_base64(self): class Subclass(self.mwcls): def get_proxyauth(self, spider): - return b'Basic YWF+Og==' + return b"Basic YWF+Og==" self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) @@ -1157,7 +1221,7 @@ def test_subclass_basic_header_urlsafe_base64(self): class Subclass(self.mwcls): def get_proxyauth(self, spider): - return b'Basic YWF-Og==' + return b"Basic YWF-Og==" self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) @@ -1187,7 +1251,9 @@ def test_header_translation(self): self.assertNotIn(header, request.headers) self.assertEqual(request.headers[translation], value) - spm_to_zyte_api_translations = {v: k for k, v in zyte_api_to_spm_translations.items()} + spm_to_zyte_api_translations = { + v: k for k, v in zyte_api_to_spm_translations.items() + } for header, translation in spm_to_zyte_api_translations.items(): request = Request( "https://example.com", @@ -1198,7 +1264,7 @@ def test_header_translation(self): self.assertNotIn(header, request.headers) self.assertEqual(request.headers[translation], value) - @patch('scrapy_zyte_smartproxy.middleware.logger') + @patch("scrapy_zyte_smartproxy.middleware.logger") def test_header_drop_warnings(self, mock_logger): self.spider.zyte_smartproxy_enabled = True crawler = self._mock_crawler(self.spider, self.settings) @@ -1245,7 +1311,7 @@ def test_header_drop_warnings(self, mock_logger): "request is proxied with %s and not with %s, and " "automatic translation is not supported for this " "header. See " - "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping" + "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping" # noqa " to learn the right way to translate this header " "manually." ), @@ -1269,7 +1335,7 @@ def test_header_drop_warnings(self, mock_logger): "request is proxied with %s and not with %s, and " "automatic translation is not supported for this " "header. See " - "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping" + "https://docs.zyte.com/zyte-api/migration/zyte/smartproxy.html#parameter-mapping" # noqa " to learn the right way to translate this header " "manually." 
), @@ -1299,21 +1365,20 @@ def test_header_based_handling(self): mw.open_spider(spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) - req = Request('http://example.com') + req = Request("http://example.com") assert mw.process_request(req, spider) is None assert httpproxy.process_request(req, spider) is None count = 0 res = Response(req.url) assert mw.process_response(req, res, spider) is res - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), None) + self.assertEqual(crawler.stats.get_value("zyte_smartproxy/response"), None) for k, v in RESPONSE_IDENTIFYING_HEADERS: count += 1 res = Response(req.url, headers={k: v}) assert mw.process_response(req, res, spider) is res - self.assertEqual(crawler.stats.get_value('zyte_smartproxy/response'), count) - + self.assertEqual(crawler.stats.get_value("zyte_smartproxy/response"), count) def test_meta_copy(self): """Warn when users copy the proxy key from one response to the next.""" @@ -1322,20 +1387,20 @@ def test_meta_copy(self): smartproxy = self.mwcls.from_crawler(crawler) smartproxy.open_spider(self.spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) - auth_header = basic_auth_header('apikey', '') + auth_header = basic_auth_header("apikey", "") - request1 = Request('https://example.com/a') + request1 = Request("https://example.com/a") self.assertEqual(smartproxy.process_request(request1, self.spider), None) self.assertEqual(httpproxy.process_request(request1, self.spider), None) - self.assertEqual(request1.meta['proxy'], 'http://proxy.zyte.com:8011') - self.assertEqual(request1.headers[b'Proxy-Authorization'], auth_header) + self.assertEqual(request1.meta["proxy"], "http://proxy.zyte.com:8011") + self.assertEqual(request1.headers[b"Proxy-Authorization"], auth_header) - request2 = Request('https://example.com/b', meta=dict(request1.meta)) - with patch('scrapy_zyte_smartproxy.middleware.logger') as logger: + request2 = Request("https://example.com/b", meta=dict(request1.meta)) + with patch("scrapy_zyte_smartproxy.middleware.logger") as logger: self.assertEqual(smartproxy.process_request(request2, self.spider), None) self.assertEqual(httpproxy.process_request(request2, self.spider), None) - self.assertEqual(request2.meta['proxy'], 'http://proxy.zyte.com:8011') - self.assertEqual(request2.headers[b'Proxy-Authorization'], auth_header) + self.assertEqual(request2.meta["proxy"], "http://proxy.zyte.com:8011") + self.assertEqual(request2.headers[b"Proxy-Authorization"], auth_header) expected_calls = [ call( "The value of the 'proxy' meta key of request {request2} " @@ -1356,14 +1421,14 @@ def test_manual_proxy_same(self): smartproxy = self.mwcls.from_crawler(crawler) smartproxy.open_spider(self.spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) - auth_header = basic_auth_header('apikey', '') + auth_header = basic_auth_header("apikey", "") - meta = {'proxy': 'http://apikey:@proxy.zyte.com:8011'} - request = Request('https://example.com', meta=meta) + meta = {"proxy": "http://apikey:@proxy.zyte.com:8011"} + request = Request("https://example.com", meta=meta) self.assertEqual(smartproxy.process_request(request, self.spider), None) self.assertEqual(httpproxy.process_request(request, self.spider), None) - self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011') - self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header) + self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011") + self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) def 
test_manual_proxy_without_api_key(self): """Defining the 'proxy' request meta key with the right URL but missing @@ -1373,15 +1438,15 @@ def test_manual_proxy_without_api_key(self): smartproxy = self.mwcls.from_crawler(crawler) smartproxy.open_spider(self.spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) - auth_header = basic_auth_header('apikey', '') + auth_header = basic_auth_header("apikey", "") - meta = {'proxy': 'http://proxy.zyte.com:8011'} - request = Request('https://example.com', meta=meta) - with patch('scrapy_zyte_smartproxy.middleware.logger') as logger: + meta = {"proxy": "http://proxy.zyte.com:8011"} + request = Request("https://example.com", meta=meta) + with patch("scrapy_zyte_smartproxy.middleware.logger") as logger: self.assertEqual(smartproxy.process_request(request, self.spider), None) self.assertEqual(httpproxy.process_request(request, self.spider), None) - self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011') - self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header) + self.assertEqual(request.meta["proxy"], "http://proxy.zyte.com:8011") + self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) expected_calls = [ call( "The value of the 'proxy' meta key of request {request} " @@ -1403,12 +1468,12 @@ def test_manual_proxy_different(self): smartproxy.open_spider(self.spider) httpproxy = HttpProxyMiddleware.from_crawler(crawler) - meta = {'proxy': 'http://proxy.example.com:8011'} - request = Request('https://example.com', meta=meta) + meta = {"proxy": "http://proxy.example.com:8011"} + request = Request("https://example.com", meta=meta) self.assertEqual(smartproxy.process_request(request, self.spider), None) self.assertEqual(httpproxy.process_request(request, self.spider), None) - self.assertEqual(request.meta['proxy'], 'http://proxy.example.com:8011') - self.assertNotIn(b'Proxy-Authorization', request.headers) + self.assertEqual(request.meta["proxy"], "http://proxy.example.com:8011") + self.assertNotIn(b"Proxy-Authorization", request.headers) def test_manual_proxy_different_auth(self): """Setting a custom 'proxy' request meta with a matching proxy URL @@ -1420,9 +1485,9 @@ def test_manual_proxy_different_auth(self): httpproxy = HttpProxyMiddleware.from_crawler(crawler) auth_header = basic_auth_header("altkey", "") - meta = {'proxy': 'http://altkey:@proxy.example.com:8011'} - request = Request('https://example.com', meta=meta) + meta = {"proxy": "http://altkey:@proxy.example.com:8011"} + request = Request("https://example.com", meta=meta) self.assertEqual(smartproxy.process_request(request, self.spider), None) self.assertEqual(httpproxy.process_request(request, self.spider), None) - self.assertEqual(request.meta['proxy'], 'http://proxy.example.com:8011') - self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header) + self.assertEqual(request.meta["proxy"], "http://proxy.example.com:8011") + self.assertEqual(request.headers[b"Proxy-Authorization"], auth_header) From 226b896d474a82428df0d694793a01080aac0891 Mon Sep 17 00:00:00 2001 From: Emmanuel Rondan Date: Thu, 7 Nov 2024 10:00:57 -0300 Subject: [PATCH 6/8] ignoring commit for pre-commit hooks --- .git-blame-ignore-revs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index e9c6069..35a706f 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1,2 +1,2 @@ # applying pre-commit hooks to the project -e00df278aa8602b18e7c3525191b843d88334c8f \ No newline at end of file 
+05665a6fb1717ef513d7a8ac87b8eb499a64cdc9 \ No newline at end of file From 2935dea8eba82a9186016d8f566154017f4e31e1 Mon Sep 17 00:00:00 2001 From: Emmanuel Rondan Date: Thu, 7 Nov 2024 10:07:32 -0300 Subject: [PATCH 7/8] adding flake8 file --- .flake8 | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..2bcd70e --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 88 From 0f50a65c5f49275f0a686a33f070ce608aa85080 Mon Sep 17 00:00:00 2001 From: Emmanuel Rondan Date: Thu, 7 Nov 2024 11:09:15 -0300 Subject: [PATCH 8/8] fixing github checks errors --- scrapy_zyte_smartproxy/middleware.py | 1 + tests/test_all.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_smartproxy/middleware.py b/scrapy_zyte_smartproxy/middleware.py index c511f7f..ddc87de 100644 --- a/scrapy_zyte_smartproxy/middleware.py +++ b/scrapy_zyte_smartproxy/middleware.py @@ -3,6 +3,7 @@ import warnings from base64 import urlsafe_b64decode from collections import defaultdict +from typing import Dict, List # noqa try: from urllib.request import _parse_proxy # type: ignore diff --git a/tests/test_all.py b/tests/test_all.py index 71e1fd9..2844b78 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -1281,7 +1281,7 @@ def test_header_drop_warnings(self, mock_logger): ) self.assertEqual(mw.process_request(request, self.spider), None) mock_logger.warning.assert_called_with( - "Translating (and dropping) header %r (%r) as %r on request %r", + "Translating header %r (%r) to %r on request %r", b"zyte-device", [b"desktop"], b"x-crawlera-profile", @@ -1296,7 +1296,7 @@ def test_header_drop_warnings(self, mock_logger): ) self.assertEqual(mw.process_request(request, self.spider), None) mock_logger.warning.assert_called_with( - "Translating (and dropping) header %r (%r) as %r on request %r", + "Translating header %r (%r) to %r on request %r", b"x-crawlera-profile", [b"desktop"], b"zyte-device",
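
The last two hunks above only reword a log message, so it is easy to miss what the surrounding code does: when a request carries headers meant for the other Zyte proxy service, the middleware renames them to the equivalent header of the service actually in use (the tests build spm_to_zyte_api_translations by inverting zyte_api_to_spm_translations, so translation runs in both directions). A minimal sketch of that idea, assuming a mapping that holds only the X-Crawlera-Profile / Zyte-Device pair visible in these hunks; the full table lives in middleware.py and the function name here is hypothetical:

    # Hypothetical, trimmed translation table: only the pair visible in the
    # hunks above is included; middleware.py defines the complete mapping.
    SPM_TO_ZYTE_API = {
        b"x-crawlera-profile": b"zyte-device",
    }
    ZYTE_API_TO_SPM = {v: k for k, v in SPM_TO_ZYTE_API.items()}


    def translate_headers(headers, using_zyte_api):
        # Pick the direction based on which proxy the request is routed to,
        # then rename any header that has a known equivalent; headers without
        # a translation pass through unchanged.
        table = SPM_TO_ZYTE_API if using_zyte_api else ZYTE_API_TO_SPM
        return {table.get(name.lower(), name): value for name, value in headers.items()}


    # A request prepared for Zyte API but routed through Smart Proxy Manager:
    print(translate_headers({b"zyte-device": [b"desktop"]}, using_zyte_api=False))
    # {b'x-crawlera-profile': [b'desktop']}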
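Earlier in the same test-suite patch, test_noslaves_delays and test_auth_error_retries patch random.uniform to return its upper bound and then assert that the download-slot delay doubles on each retry (backoff_step, then backoff_step * 2**1, backoff_step * 2**2, ...) until it is capped by ZYTE_SMARTPROXY_BACKOFF_MAX. Under that reading of the assertions, the backoff being exercised looks like the sketch below; the function name and the demo values (the auth-error test uses a step of 15 and a cap of 70) are illustrative, not the middleware's actual code:

    import random


    def backoff_delay(retry_times, backoff_step, backoff_max):
        # The delay doubles with every retry but never exceeds the cap.
        upper = min(backoff_max, backoff_step * 2**retry_times)
        # Jitter: drawing uniformly from [0, upper] spreads retries out so
        # concurrent requests do not all come back at the same instant.
        return random.uniform(0, upper)


    # With random.uniform forced to return its upper bound, as the tests do:
    for retries in range(5):
        print(retries, min(70, 15 * 2**retries))  # 15, 30, 60, 70, 70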
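Two smaller points from the follow-up patches: the hash recorded in .git-blame-ignore-revs lets git blame skip the bulk-reformatting commit once the file is wired up (for example via `git config blame.ignoreRevsFile .git-blame-ignore-revs`), so line authorship is not drowned out by mechanical changes; and `max-line-length = 88` in .flake8 matches black's default line length, so flake8 will not flag lines that black itself produces. The `# noqa` on the typing import added to middleware.py presumably silences flake8's unused-import check, since Dict and List are only referenced in comment-style type hints.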