Skip to content

Commit ca6dcf0

Browse files
committed
Use cloudscraper to solve cloudflare challenges
Also add a fallback using requests, but it is hacky and only works sometimes; cloudscraper stands a better chance of consistently reaching the final URL.
1 parent 2e4b622 commit ca6dcf0

File tree

2 files changed

+42
-7
lines changed

2 files changed

+42
-7
lines changed

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ docs = [
4848
"sphinx>=4",
4949
"sphinx_rtd_theme>=1",
5050
]
51+
# For solving client-side challenges on DDoS-protected sites
52+
# (e.g. those using Cloudflare)
53+
challenges = [
54+
"cloudscraper",
55+
]
5156

5257
[project.urls]
5358
Repository = "https://github.com/papis/python-doi"

tests/test_doi.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import os
22

3-
from urllib.request import Request, urlopen
3+
import requests
4+
try:
5+
import cloudscraper
6+
except ImportError:
7+
cloudscraper = None
48
from urllib.parse import urlparse, urlunparse
59
from warnings import warn
610

@@ -21,21 +25,47 @@ def resolve_redirects(u):
2125
# If removed, it'd make sense to canonicalize in simplify_url instead to
2226
# prevent spurious test failures
2327
u = urlunparse(urlparse(u)._replace(scheme='https'))
24-
req = Request(u, headers={'User-Agent': 'Mozilla/5.0'})
25-
with urlopen(req) as r:
26-
return simplify_url(r.url)
2728

29+
if cloudscraper:
30+
scraper = cloudscraper.create_scraper()
31+
return simplify_url(scraper.get(u).url)
2832

29-
def normalize_eq(u, v):
33+
# Try emulating a browser to not get blocked
34+
h = {'User-Agent': 'Mozilla/5.0'}
35+
resp = requests.get(u, headers=h)
36+
return simplify_url(resp.url)
37+
38+
39+
def normalize_eq(u, v, expect_diff=False):
3040
if u == v:
3141
return True
32-
warn(f"{u} textually differs from {v}, please update the relevant case.\n"
33-
"Attempting to recover by resolving redirects")
42+
if not expect_diff:
43+
warn(f"{u} textually differs from {v}, please update the relevant case.\n"
44+
"Attempting to recover by resolving redirects")
3445
return (simplify_url(u) == simplify_url(v)
3546
or resolve_redirects(u) == resolve_redirects(v)
3647
)
3748

3849

50+
@pytest.mark.net
51+
@pytest.mark.parametrize(
52+
"needs_cloudscraper, urls",
53+
[
54+
(True,
55+
["http://pubs.aip.org/aip/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled", # noqa: E501
56+
"http://pubs.aip.org/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled", # noqa: E501
57+
"http://aip.scitation.org/doi/10.1063/1.5081715"
58+
]),
59+
]
60+
)
61+
def test_redirect(needs_cloudscraper, urls) -> None:
62+
base = urls[0]
63+
if needs_cloudscraper and cloudscraper is None:
64+
pytest.skip(f"cloudscraper needed to solve CloudFlare challenge on {base}")
65+
for other in urls[1:]:
66+
assert normalize_eq(base, other, expect_diff=True)
67+
68+
3969
@pytest.mark.net
4070
def test_validate_doi() -> None:
4171
data = [

0 commit comments

Comments
 (0)