Commit 781716b

Use cloudscraper to solve cloudflare challenges
Also put in a fallback using requests, but it is hacky and only works sometimes. cloudscraper stands a better chance of consistently being able to get to the final URL
1 parent 2e4b622 commit 781716b

2 files changed: +43 -7 lines changed

pyproject.toml

Lines changed: 5 additions & 0 deletions
@@ -48,6 +48,11 @@ docs = [
     "sphinx>=4",
     "sphinx_rtd_theme>=1",
 ]
+# For solving client-side challenges on DDoS-protected sites
+# (eg those using CloudFlare)
+challenges = [
+    "cloudscraper",
+]
 
 [project.urls]
 Repository = "https://github.com/papis/python-doi"
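
With the optional dependency group in place, cloudscraper can be pulled in as an extra at install time. Assuming the distribution is published under the name suggested by the repository URL (python-doi), an invocation along these lines would do it:

    pip install "python-doi[challenges]"

Keeping cloudscraper behind an extra keeps the core package dependency-light; only users who need to get past Cloudflare-style challenges opt in.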

tests/test_doi.py

Lines changed: 38 additions & 7 deletions
@@ -1,6 +1,11 @@
 import os
 
-from urllib.request import Request, urlopen
+import requests
+
+try:
+    import cloudscraper
+except ImportError:
+    cloudscraper = None
 from urllib.parse import urlparse, urlunparse
 from warnings import warn
 

@@ -21,21 +26,47 @@ def resolve_redirects(u):
     # If removed, it'd make sense to canonicalize in simplify_url instead to
     # prevent spurious test failures
     u = urlunparse(urlparse(u)._replace(scheme='https'))
-    req = Request(u, headers={'User-Agent': 'Mozilla/5.0'})
-    with urlopen(req) as r:
-        return simplify_url(r.url)
 
+    if cloudscraper:
+        scraper = cloudscraper.create_scraper()
+        return simplify_url(scraper.get(u).url)
 
-def normalize_eq(u, v):
+    # Try emulating a browser to not get blocked
+    h = {'User-Agent': 'Mozilla/5.0'}
+    resp = requests.get(u, headers=h)
+    return simplify_url(resp.url)
+
+
+def normalize_eq(u, v, expect_diff=False):
     if u == v:
         return True
-    warn(f"{u} textually differs from {v}, please update the relevant case.\n"
-         "Attempting to recover by resolving redirects")
+    if not expect_diff:
+        warn(f"{u} textually differs from {v}, please update the relevant case.\n"
+             "Attempting to recover by resolving redirects")
     return (simplify_url(u) == simplify_url(v)
             or resolve_redirects(u) == resolve_redirects(v)
             )
 
 
+@pytest.mark.net
+@pytest.mark.parametrize(
+    "needs_cloudscraper, urls",
+    [
+        (True,
+         ["http://pubs.aip.org/aip/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled",  # noqa: E501
+          "http://pubs.aip.org/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled",  # noqa: E501
+          "http://aip.scitation.org/doi/10.1063/1.5081715"
+          ]),
+    ]
+)
+def test_redirect(needs_cloudscraper, urls) -> None:
+    base = urls[0]
+    if needs_cloudscraper and cloudscraper is None:
+        pytest.skip(f"cloudscraper needed to solve CloudFlare challenge on {base}")
+    for other in urls[1:]:
+        assert normalize_eq(base, other, expect_diff=True)
+
+
 @pytest.mark.net
 def test_validate_doi() -> None:
     data = [
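
The new test_redirect is gated behind the same net marker already used by test_validate_doi, and the parametrized case skips itself when cloudscraper cannot be imported. As a rough usage sketch (assuming the net marker is registered in the project's pytest configuration), the network tests can be selected with:

    pytest -m net tests/test_doi.py

With cloudscraper installed, the test asserts that the three AIP URLs above resolve to the same simplified URL; without it, the case is reported as skipped rather than failed.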
