Commit 781716b

Use cloudscraper to solve cloudflare challenges
Also put in a fallback using requests, but it is hacky and only works sometimes. cloudscraper stands a better chance of consistently being able to get to the final URL
1 parent 2e4b622 commit 781716b

2 files changed: +43 -7 lines changed

pyproject.toml

Lines changed: 5 additions & 0 deletions
@@ -48,6 +48,11 @@ docs = [
     "sphinx>=4",
     "sphinx_rtd_theme>=1",
 ]
+# For solving client-side challenges on DDoS-protected sites
+# (eg those using CloudFlare)
+challenges = [
+    "cloudscraper",
+]
 
 [project.urls]
 Repository = "https://github.com/papis/python-doi"
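
With the optional dependency group in place, cloudscraper can be pulled in as an extra at install time. Assuming the distribution is published under the name suggested by the repository URL (python-doi), an invocation along these lines would do it:

    pip install "python-doi[challenges]"

Keeping cloudscraper behind an extra keeps the core package dependency-light; only users who need to get past Cloudflare-style challenges opt in.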

tests/test_doi.py

Lines changed: 38 additions & 7 deletions
@@ -1,6 +1,11 @@
 import os
 
-from urllib.request import Request, urlopen
+import requests
+
+try:
+    import cloudscraper
+except ImportError:
+    cloudscraper = None
 from urllib.parse import urlparse, urlunparse
 from warnings import warn
 

@@ -21,21 +26,47 @@ def resolve_redirects(u):
     # If removed, it'd make sense to canonicalize in simplify_url instead to
     # prevent spurious test failures
     u = urlunparse(urlparse(u)._replace(scheme='https'))
-    req = Request(u, headers={'User-Agent': 'Mozilla/5.0'})
-    with urlopen(req) as r:
-        return simplify_url(r.url)
 
+    if cloudscraper:
+        scraper = cloudscraper.create_scraper()
+        return simplify_url(scraper.get(u).url)
 
-def normalize_eq(u, v):
+    # Try emulating a browser to not get blocked
+    h = {'User-Agent': 'Mozilla/5.0'}
+    resp = requests.get(u, headers=h)
+    return simplify_url(resp.url)
+
+
+def normalize_eq(u, v, expect_diff=False):
     if u == v:
         return True
-    warn(f"{u} textually differs from {v}, please update the relevant case.\n"
-         "Attempting to recover by resolving redirects")
+    if not expect_diff:
+        warn(f"{u} textually differs from {v}, please update the relevant case.\n"
+             "Attempting to recover by resolving redirects")
     return (simplify_url(u) == simplify_url(v)
             or resolve_redirects(u) == resolve_redirects(v)
             )
 
 
+@pytest.mark.net
+@pytest.mark.parametrize(
+    "needs_cloudscraper, urls",
+    [
+        (True,
+         ["http://pubs.aip.org/aip/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled",  # noqa: E501
+          "http://pubs.aip.org/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled",  # noqa: E501
+          "http://aip.scitation.org/doi/10.1063/1.5081715"
+          ]),
+    ]
+)
+def test_redirect(needs_cloudscraper, urls) -> None:
+    base = urls[0]
+    if needs_cloudscraper and cloudscraper is None:
+        pytest.skip(f"cloudscraper needed to solve CloudFlare challenge on {base}")
+    for other in urls[1:]:
+        assert normalize_eq(base, other, expect_diff=True)
+
+
 @pytest.mark.net
 def test_validate_doi() -> None:
     data = [
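
The new test_redirect is gated behind the same net marker already used by test_validate_doi, and the parametrized case skips itself when cloudscraper cannot be imported. As a rough usage sketch (assuming the net marker is registered in the project's pytest configuration), the network tests can be selected with:

    pytest -m net tests/test_doi.py

With cloudscraper installed, the test asserts that the three AIP URLs above resolve to the same simplified URL; without it, the case is reported as skipped rather than failed.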
