From daef1c5ec607f32e632d798fd3d107c59dddccb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 20 Nov 2023 15:51:09 +0100 Subject: [PATCH] Fix Zyte API support (#112) --- docs/headers.rst | 9 ++++----- docs/index.rst | 18 +++++++++--------- docs/news.rst | 9 ++++++++- docs/settings.rst | 12 ++++++------ scrapy_zyte_smartproxy/middleware.py | 8 +++----- tests/test_all.py | 4 +--- 6 files changed, 31 insertions(+), 29 deletions(-) diff --git a/docs/headers.rst b/docs/headers.rst index f94666d..c999fde 100644 --- a/docs/headers.rst +++ b/docs/headers.rst @@ -1,13 +1,13 @@ Headers ======= -The Zyte proxy API services that you can use with this downloader middleware -each support a different set of HTTP request and response headers that give -you access to additional features. You can find more information about those +The Zyte proxy services that you can use with this downloader middleware each +support a different set of HTTP request and response headers that give you +access to additional features. You can find more information about those headers in the documentation of each service, `Zyte API’s <zyte-api-headers_>`_ and `Zyte Smart Proxy Manager’s <spm-headers_>`_. -.. _zyte-api-headers: https://docs.zyte.com/zyte-api/usage/proxy-api.html +.. _zyte-api-headers: https://docs.zyte.com/zyte-api/usage/proxy-mode.html .. 
_spm-headers: https://docs.zyte.com/smart-proxy-manager.html#request-headers If you try to use a header for one service while using the other service, this @@ -24,7 +24,6 @@ Translation is supported for the following headers: ========================= =========================== Zyte API Zyte Smart Proxy Manager ========================= =========================== -``Zyte-Client`` ``X-Crawlera-Client`` ``Zyte-Device`` ``X-Crawlera-Profile`` ``Zyte-Error`` ``X-Crawlera-Error`` ``Zyte-Geolocation`` ``X-Crawlera-Region`` diff --git a/docs/index.rst b/docs/index.rst index a6783cf..5d2335d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,10 +10,11 @@ scrapy-zyte-smartproxy |version| documentation news scrapy-zyte-smartproxy is a `Scrapy downloader middleware`_ to use one of -Zyte’s proxy APIs: either the proxy API of `Zyte API`_ or `Zyte Smart Proxy -Manager`_ (formerly Crawlera). +Zyte’s proxy services: either the `proxy mode`_ of `Zyte API`_ or `Zyte Smart +Proxy Manager`_ (formerly Crawlera). .. _Scrapy downloader middleware: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +.. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html .. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/ @@ -52,7 +53,7 @@ Configuration #. Set the ``ZYTE_SMARTPROXY_URL`` Scrapy setting as needed: - - To use the proxy API of Zyte API, set it to + - To use the `proxy mode`_ of `Zyte API`_, set it to ``http://api.zyte.com:8011``: .. code-block:: python @@ -76,14 +77,13 @@ Usage ===== Once the downloader middleware is properly configured, every request goes -through the configured Zyte proxy API. +through the configured Zyte proxy service. .. 
_override: -Although the plugin configuration only allows defining a single proxy API -endpoint and API key, it is possible to override them for specific requests, so -that you can use different combinations for different requests within the same -spider. +Although the plugin configuration only allows defining a single proxy endpoint +and API key, it is possible to override them for specific requests, so that you +can use different combinations for different requests within the same spider. To **override** which combination of endpoint and API key is used for a given request, set ``proxy`` in the request metadata to a URL indicating both the @@ -128,7 +128,7 @@ or using the DEFAULT_REQUEST_HEADERS_ setting. For example: }, ) -.. _Zyte API proxy headers: https://docs.zyte.com/zyte-api/usage/proxy-api.html +.. _Zyte API proxy headers: https://docs.zyte.com/zyte-api/usage/proxy-mode.html .. _Zyte Smart Proxy Manager headers: https://docs.zyte.com/smart-proxy-manager.html#request-headers .. _Scrapy headers: https://doc.scrapy.org/en/latest/topics/request-response.html#scrapy.http.Request.headers .. _DEFAULT_REQUEST_HEADERS: https://doc.scrapy.org/en/latest/topics/settings.html#default-request-headers diff --git a/docs/news.rst b/docs/news.rst index b84bacd..51a8e7b 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -3,11 +3,18 @@ Changes ======= +v2.3.1 (2023-11-20) +------------------- + +Fixed `Zyte API`_ `proxy mode`_ support by removing the mapping of unsupported +headers ``Zyte-Client`` and ``Zyte-No-Bancheck``. + v2.3.0 (2023-10-20) ------------------- -Added support for the upcoming proxy API of `Zyte API`_. +Added support for the upcoming `proxy mode`_ of `Zyte API`_. +.. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html Added a BSD-3-Clause license file. 
diff --git a/docs/settings.rst b/docs/settings.rst index 81a1bf3..e0aeff0 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -3,14 +3,14 @@ Settings ======== This Scrapy downloader middleware adds some settings to configure how to work -with your Zyte proxy API. +with your Zyte proxy service. ZYTE_SMARTPROXY_APIKEY ---------------------- Default: ``None`` -Default API key for your Zyte proxy API service. +Default API key for your Zyte proxy service. Note that Zyte API and Zyte Smart Proxy Manager have different API keys. @@ -22,7 +22,7 @@ ZYTE_SMARTPROXY_URL Default: ``'http://proxy.zyte.com:8011'`` -Default endpoint for your Zyte proxy API service. +Default endpoint for your Zyte proxy service. For guidelines on setting a value, see the :ref:`initial configuration instructions `. @@ -79,9 +79,9 @@ ZYTE_SMARTPROXY_FORCE_ENABLE_ON_HTTP_CODES Default: ``[]`` -List of HTTP response status codes that warrant enabling your Zyte proxy API +List of HTTP response status codes that warrant enabling your Zyte proxy service for the corresponding domain. When a response with one of these HTTP status codes is received after an -unproxied request, the request is retried with your Zyte proxy API service, and -any new request to the same domain is also proxied. +unproxied request, the request is retried with your Zyte proxy service, and any +new request to the same domain is also proxied. 
diff --git a/scrapy_zyte_smartproxy/middleware.py b/scrapy_zyte_smartproxy/middleware.py index b8d94c1..31ef82b 100644 --- a/scrapy_zyte_smartproxy/middleware.py +++ b/scrapy_zyte_smartproxy/middleware.py @@ -37,11 +37,9 @@ class ZyteSmartProxyMiddleware(object): enabled_for_domain = {} apikey = "" zyte_api_to_spm_translations = { - b"zyte-client": b"x-crawlera-client", b"zyte-device": b"x-crawlera-profile", b"zyte-geolocation": b"x-crawlera-region", b"zyte-jobid": b"x-crawlera-jobid", - b"zyte-no-bancheck": b"x-crawlera-no-bancheck", b"zyte-override-headers": b"x-crawlera-profile-pass", } spm_to_zyte_api_translations = {v: k for k, v in zyte_api_to_spm_translations.items()} @@ -222,9 +220,9 @@ def process_request(self, request, spider): if self.job_id: job_header = 'Zyte-JobId' if targets_zyte_api else 'X-Crawlera-JobId' request.headers[job_header] = self.job_id - client_header = 'Zyte-Client' if targets_zyte_api else 'X-Crawlera-Client' - from scrapy_zyte_smartproxy import __version__ - request.headers[client_header] = 'scrapy-zyte-smartproxy/%s' % __version__ + if not targets_zyte_api: + from scrapy_zyte_smartproxy import __version__ + request.headers['X-Crawlera-Client'] = 'scrapy-zyte-smartproxy/%s' % __version__ self.crawler.stats.inc_value('zyte_smartproxy/request') self.crawler.stats.inc_value('zyte_smartproxy/request/method/%s' % request.method) self._translate_headers(request, targets_zyte_api=targets_zyte_api) diff --git a/tests/test_all.py b/tests/test_all.py index 0628866..b51abac 100644 --- a/tests/test_all.py +++ b/tests/test_all.py @@ -983,7 +983,7 @@ def test_client_header(self): ) self.assertEqual(mw.process_request(req2, self.spider), None) self.assertEqual(req2.headers.get('X-Crawlera-Client'), None) - self.assertEqual(req2.headers.get('Zyte-Client'), client) + self.assertEqual(req2.headers.get('Zyte-Client'), None) def test_scrapy_httpproxy_integration(self): self.spider.zyte_smartproxy_enabled = True @@ -1062,11 +1062,9 @@ def 
test_header_translation(self): value = b"foo" zyte_api_to_spm_translations = { - b"Zyte-Client": b"X-Crawlera-Client", b"Zyte-Device": b"X-Crawlera-Profile", b"Zyte-Geolocation": b"X-Crawlera-Region", b"Zyte-JobId": b"X-Crawlera-JobId", - b"Zyte-No-Bancheck": b"X-Crawlera-No-Bancheck", b"Zyte-Override-Headers": b"X-Crawlera-Profile-Pass", } for header, translation in zyte_api_to_spm_translations.items():