
Commit e9dca8c

Merge pull request #45 from zivy/updateURLChecks
Improve URL checking.
2 parents 99c5870 + 4183f3f commit e9dca8c

File tree: 3 files changed (+191, -52 lines)

CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -21,6 +21,12 @@ needed. This is equivalent to summarizing all activity on a feature branch versu
 
 ## Unreleased
 
+## v0.9.6
+
+### Changed
+
+validation_utilities - Improvements to the URL checking code. Increase the chances of not getting a 403 (forbidden, server refused to authorize request) by mimicking a browser. This doesn't solve the issue when sites really don't want to be scanned by scripts/bots, just improves the chances.
+
 ## v0.9.4
 
 ### Changed
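
The changelog entry above boils down to one idea: a request that announces itself as a script is more likely to be refused with 403 than one that looks like a browser visit. A minimal sketch of the difference, assuming an illustrative URL and User-Agent string (neither is taken from this commit):

```python
import requests

# Illustrative URL only, not one checked by the knowledge base.
url = "https://example.com/some/page"

# A bare HEAD request advertises itself as python-requests and is more likely
# to be refused with 403 by sites that filter out scripts/bots.
bare = requests.head(url, timeout=2, allow_redirects=True)

# The same request sent from a Session with browser-like headers mimics a
# regular browser visit, which improves (but does not guarantee) acceptance.
with requests.Session() as session:
    session.headers.update(
        {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        }
    )
    mimicked = session.head(url, timeout=2, allow_redirects=True)

print(bare.status_code, mimicked.status_code)
```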

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ibex_imaging_knowledge_base_utilities"
-version = "0.9.4"
+version = "0.9.6"
 authors = [{ name="Ziv Yaniv", email="[email protected]" },
 ]
 description = "Utility scripts used for managing the IBEX Imaging Community Knowledge-Base"

src/ibex_imaging_knowledge_base_utilities/data_validation/validation_utilities.py

Lines changed: 184 additions & 51 deletions

@@ -22,68 +22,175 @@
 from concurrent.futures import ThreadPoolExecutor
 import sys
 import pandas as pd
+import random
+from urllib.parse import urlparse
+from collections import defaultdict
 
 
 def url_exists(
     url,
+    session,
     request_timeout=2,
     allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    delay_between_tries=[0.1, 0.3],
     **kwargs,
 ):
-    """
-    Check if a url exists. All parameters supported by requests.head are available. Those not explicitly
-    listed are forwarded as is to requests.head. Timeout exceptions are considered a success. Also,
-    the 403 "forbidden" code (server understands the request but refuses to authorize it) is considered
-    success too.
-    """
     try:
-        # Using requests.head and not requests.get because
-        # we don't need the webpage content.
-        res = requests.head(
+        # Using head and not get because
+        # we don't need the webpage content so more eco friendly.
+        res = session.head(
             url,
             timeout=request_timeout,
-            headers=request_headers,
             allow_redirects=allow_redirects,
             **kwargs,
         )
+        # head resulted in a 405 Method Not Allowed, try get after a short delay
+        if res.status_code == 405:
+            time.sleep(random.uniform(delay_between_tries[0], delay_between_tries[1]))
+            res = session.get(
+                url,
+                timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                stream=True,  # don't download full content
+                **kwargs,
+            )
+            res.close()  # close the connection immediately
         if res.status_code == 403:
             print(f"{url}: 403 forbidden code, server refused to authorize request")
-        # HTTP 200 status code for success
-        return res.status_code in [200, 403]
+        # HTTP 200 status code for success, 30x redirects and 403 forbidden. We consider
+        # all these responses as indicating that the URL exists.
+        return res.status_code in [200, 301, 302, 303, 307, 308, 403]
     except requests.exceptions.Timeout:
        print(f"{url}: timed out ({request_timeout}sec)")
        return True
+    except requests.exceptions.SSLError:
+        print(f"{url}: SSL error, but URL likely exists")
+        return True
     except requests.exceptions.RequestException as e:
         print(f"{url}: {e}")
         return False
 
 
-def url_exists_with_retries(
-    url,
+def urls_exist_with_retries(
+    urls,
     retry_num=0,
+    delay_between_tries=[0.1, 0.3],
     request_timeout=2,
     allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    request_headers=None,
     **kwargs,
 ):
-    if url_exists(url, request_timeout, allow_redirects, request_headers, **kwargs):
-        return True
-    # Retry using exponential backoff + some randomness so that we don't get a bunch of
-    # threads or processes all performing queries in a synchronized fashion.
-    # We could have used the Retry built in strategy
-    # (https://requests.readthedocs.io/en/latest/user/advanced/#example-automatic-retries,
-    # https://www.zenrows.com/blog/python-requests-retry#best-methods-to-retry)
-    # but it isn't more succinct or efficient than what is implemented here.
-    for i in range(retry_num):
-        time.sleep(pow(base=2, exp=i) + np.random.random())
-        if url_exists(url, request_timeout, allow_redirects, request_headers, **kwargs):
-            return True
-    return False
+    """
+    Check if a set of urls exist. All parameters supported by requests.head are available. Those not explicitly
+    listed are forwarded as is to requests.head. Timeout exceptions are considered a success. Also,
+    the 403 "forbidden" code (server understands the request but refuses to authorize it) is considered
+    success too.
+    """
+    # Use various user agents to reduce chances of blocking, 403 response.
+    user_agents = [
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # noqa: E501
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # noqa: E501
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",  # noqa: E501
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",  # noqa: E501
+    ]
+
+    if request_headers is None:
+        # Create comprehensive browser-like headers to avoid 403 errors
+        selected_user_agent = random.choice(user_agents)
+
+        # Determine browser type from user agent for consistent headers
+        is_chrome = "Chrome" in selected_user_agent and "Edg" not in selected_user_agent
+        is_firefox = "Firefox" in selected_user_agent
+        is_safari = (
+            "Safari" in selected_user_agent and "Chrome" not in selected_user_agent
+        )
+        is_edge = "Edg" in selected_user_agent
+
+        # Base headers that work for most sites
+        request_headers = {
+            "User-Agent": selected_user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",  # noqa: E501
+            "Accept-Language": "en-US,en;q=0.9",
+            "Accept-Encoding": "gzip, deflate, br",
+            "DNT": "1",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Cache-Control": "max-age=0",
+        }
+
+        # Browser-specific headers for maximum authenticity
+        if is_chrome or is_edge:
+            request_headers.update(
+                {
+                    "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    "sec-ch-ua-mobile": "?0",
+                    "sec-ch-ua-platform": (
+                        '"Windows"'
+                        if "Windows" in selected_user_agent
+                        else '"macOS"' if "Mac" in selected_user_agent else '"Linux"'
+                    ),
+                }
+            )
+        elif is_firefox:
+            # Firefox doesn't send sec-ch-ua headers
+            pass
+        elif is_safari:
+            # Safari has different sec headers
+            request_headers.update(
+                {
+                    "Sec-Fetch-Site": "same-origin",
+                }
+            )
+    results = []
+    with requests.Session() as session:
+        session.headers.update(request_headers)
+        for url in urls:
+            if url_exists(
+                url,
+                session,
+                request_timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                **kwargs,
+            ):
+                results.append(True)
+            else:
+                # Retry using exponential backoff + some randomness so that we don't get a bunch of
+                # threads or processes all performing queries in a synchronized fashion.
+                # We could have used the Retry built in strategy
+                # (https://requests.readthedocs.io/en/latest/user/advanced/#example-automatic-retries,
+                # https://www.zenrows.com/blog/python-requests-retry#best-methods-to-retry)
+                # but it isn't more succinct or efficient than what is implemented here.
+                i = 0
+                not_successful = True
+                while i < retry_num and not_successful:
+                    time.sleep(pow(base=2, exp=i) + np.random.random())
+                    if url_exists(
+                        url,
+                        session,
+                        request_timeout=request_timeout,
+                        allow_redirects=allow_redirects,
+                        **kwargs,
+                    ):
+                        results.append(True)
+                        not_successful = False
+                    i += 1
+                if not_successful:
+                    results.append(False)
+
+            # Add small delay between requests, especially for same domain
+            if len(urls) > 1:
+                time.sleep(
+                    random.uniform(delay_between_tries[0], delay_between_tries[1])
+                )
+    return results
 
 
 def check_urls(
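
The retry loop added in urls_exist_with_retries above sleeps 2**i plus random jitter before retry i, so that concurrently running threads do not all requery a host in lockstep. A standalone sketch of just that backoff pattern, with a hypothetical flaky_check standing in for the repository's url_exists call:

```python
import random
import time


def retry_with_backoff(check, retry_num=3):
    """Call check(); on failure retry up to retry_num times, sleeping 2**i + jitter."""
    if check():
        return True
    for i in range(retry_num):
        # Exponential backoff plus random jitter desynchronizes concurrent workers.
        time.sleep(2**i + random.random())
        if check():
            return True
    return False


# Hypothetical flaky check that only succeeds on its third call.
attempts = {"count": 0}


def flaky_check():
    attempts["count"] += 1
    return attempts["count"] >= 3


print(retry_with_backoff(flaky_check, retry_num=4))  # prints True
```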
@@ -92,30 +199,56 @@ def check_urls(
     retry_num=0,
     request_timeout=10,
     allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    request_headers=None,
     **kwargs,
 ):
     """
-    Check url existence for a number of urls in a container. It is assumed that the urls
-    are for different hosts (if they are on the same host then better to use a
-    requests.Session object).
+    Check url existence for the urls given in the container.
+
+    This is a multi-threaded approach where each thread uses its own requests.Session
+    object. URLs are grouped by domain and each group is checked using an independent
+    thread and session so that multiple threads don't try to connect to the same host concurrently.
+    This is a best effort in terms of mimicking browser requests to avoid 403 errors, though these
+    are considered as successful. Unfortunately, this still does not pass the scrutiny of some
+    sites (possibly protected for Denial Of Service attacks) and they
+    return a 403 error. Timeout exceptions are also considered a success.
+    Finally, some sites return a 404 error even when they exist (https://www.sysy.com/). This
+    may be a server error or intentional when the site identifies that it is being queried by
+    a script/bot.
+
+    Returns a list of boolean values indicating for each url in the container if it exists
+    or not.
     """
+
+    results_dict = {}
+    # Split the urls by domain to avoid multiple threads concurrently trying to
+    # connect to the same host. Each thread will use its own requests.Session.
+    domain_groups = defaultdict(list)
+    for url in urls_container:
+        try:
+            domain = urlparse(url).netloc
+            domain_groups[domain].append(url)
+        except Exception:
+            # If URL parsing fails, then the URL is incorrect
+            results_dict[url] = False
+
     with ThreadPoolExecutor(num_threads) as executor:
-        return list(
-            executor.map(
-                lambda x: url_exists_with_retries(
-                    url=x,
-                    retry_num=retry_num,
-                    request_timeout=request_timeout,
-                    allow_redirects=allow_redirects,
-                    request_headers=request_headers,
-                    **kwargs,
-                ),
-                urls_container,
-            )
+        dict_values = domain_groups.values()
+        res = executor.map(
+            lambda x: urls_exist_with_retries(
+                urls=x,
+                retry_num=retry_num,
+                request_timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                request_headers=request_headers,
+                **kwargs,
+            ),
+            dict_values,
         )
+    for url_list, result_list in zip(dict_values, res):
+        for url, result in zip(url_list, result_list):
+            results_dict[url] = result
+    return [results_dict[url] for url in urls_container]
 
 
 def duplicated_entries_single_value_columns(df, column_names):
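
With this change, check_urls groups the input URLs by domain, checks each group in its own thread and requests.Session, and returns one boolean per input URL in the original order. A hedged usage sketch: the URLs are placeholders, the import path is inferred from the file location above, and passing num_threads by keyword is an assumption since the full signature is outside this hunk:

```python
# Import path inferred from the file location shown in this commit.
from ibex_imaging_knowledge_base_utilities.data_validation.validation_utilities import (
    check_urls,
)

# Placeholder URLs for illustration only.
urls = [
    "https://example.com/antibody",
    "https://example.com/protocol",
    "https://example.org/reagent",
]

# example.com URLs share one thread/session; example.org is checked concurrently.
# retry_num comes from the visible signature; num_threads is assumed from the body.
exists = check_urls(urls, num_threads=2, retry_num=1)
for url, ok in zip(urls, exists):
    print(f"{url}: {'exists' if ok else 'not found'}")
```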
