from concurrent.futures import ThreadPoolExecutor
import sys
import pandas as pd
+import random
+from urllib.parse import urlparse
+from collections import defaultdict


def url_exists(
    url,
+    session,
    request_timeout=2,
    allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    delay_between_tries=[0.1, 0.3],
    **kwargs,
):
-    """
-    Check if a url exists. All parameters supported by requests.head are available. Those not explicitly
-    listed are forwarded as is to requests.head. Timeout exceptions are considered a success. Also,
-    the 403 "forbidden" code (server understands the request but refuses to authorize it) is considered
-    success too.
-    """
    try:
-        # Using requests.head and not requests.get because
-        # we don't need the webpage content.
-        res = requests.head(
+        # Using head and not get because we don't need the
+        # webpage content, so it is more eco-friendly.
+        res = session.head(
            url,
            timeout=request_timeout,
-            headers=request_headers,
            allow_redirects=allow_redirects,
            **kwargs,
        )
+        # head resulted in a 405 Method Not Allowed, try get after a short delay
+        if res.status_code == 405:
+            time.sleep(random.uniform(delay_between_tries[0], delay_between_tries[1]))
+            res = session.get(
+                url,
+                timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                stream=True,  # don't download the full content
+                **kwargs,
+            )
+            res.close()  # close the connection immediately
        if res.status_code == 403:
            print(f"{url}: 403 forbidden code, server refused to authorize request")
-        # HTTP 200 status code for success
-        return res.status_code in [200, 403]
+        # HTTP 200 success, 30x redirects and 403 forbidden are all considered
+        # as indicating that the URL exists.
+        return res.status_code in [200, 301, 302, 303, 307, 308, 403]
    except requests.exceptions.Timeout:
        print(f"{url}: timed out ({request_timeout} sec)")
        return True
+    except requests.exceptions.SSLError:
+        print(f"{url}: SSL error, but URL likely exists")
+        return True
    except requests.exceptions.RequestException as e:
        print(f"{url}: {e}")
        return False
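
# Illustrative usage sketch (example URL assumed; not part of this diff): url_exists now
# expects a caller-managed requests.Session, so a one-off check would look something like
#
#     with requests.Session() as s:
#         print(url_exists("https://www.python.org", s, request_timeout=5))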


-def url_exists_with_retries(
-    url,
+def urls_exist_with_retries(
+    urls,
    retry_num=0,
+    delay_between_tries=[0.1, 0.3],
    request_timeout=2,
    allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    request_headers=None,
    **kwargs,
):
-    if url_exists(url, request_timeout, allow_redirects, request_headers, **kwargs):
-        return True
-    # Retry using exponential backoff + some randomness so that we don't get a bunch of
-    # threads or processes all performing queries in a synchronized fashion.
-    # We could have used the Retry built in strategy
-    # (https://requests.readthedocs.io/en/latest/user/advanced/#example-automatic-retries,
-    # https://www.zenrows.com/blog/python-requests-retry#best-methods-to-retry)
-    # but it isn't more succinct or efficient than what is implemented here.
-    for i in range(retry_num):
-        time.sleep(pow(base=2, exp=i) + np.random.random())
-        if url_exists(url, request_timeout, allow_redirects, request_headers, **kwargs):
-            return True
-    return False
+    """
+    Check whether each url in the given container exists. All parameters supported by
+    requests.head are available; those not explicitly listed are forwarded as-is to
+    requests.head. Timeout exceptions are considered a success. Also, the 403 "forbidden"
+    code (server understands the request but refuses to authorize it) is considered a
+    success.
+    """
+    # Use a variety of user agents to reduce the chance of being blocked with a 403 response.
+    user_agents = [
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # noqa: E501
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # noqa: E501
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",  # noqa: E501
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",  # noqa: E501
+    ]
+
+    if request_headers is None:
+        # Create comprehensive browser-like headers to avoid 403 errors
+        selected_user_agent = random.choice(user_agents)
+
+        # Determine browser type from the user agent for consistent headers
+        is_chrome = "Chrome" in selected_user_agent and "Edg" not in selected_user_agent
+        is_firefox = "Firefox" in selected_user_agent
+        is_safari = (
+            "Safari" in selected_user_agent and "Chrome" not in selected_user_agent
+        )
+        is_edge = "Edg" in selected_user_agent
+
+        # Base headers that work for most sites
+        request_headers = {
+            "User-Agent": selected_user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",  # noqa: E501
+            "Accept-Language": "en-US,en;q=0.9",
+            "Accept-Encoding": "gzip, deflate, br",
+            "DNT": "1",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Cache-Control": "max-age=0",
+        }
+
+        # Browser-specific headers for maximum authenticity
+        if is_chrome or is_edge:
+            request_headers.update(
+                {
+                    "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    "sec-ch-ua-mobile": "?0",
+                    "sec-ch-ua-platform": (
+                        '"Windows"'
+                        if "Windows" in selected_user_agent
+                        else '"macOS"' if "Mac" in selected_user_agent else '"Linux"'
+                    ),
+                }
+            )
+        elif is_firefox:
+            # Firefox doesn't send sec-ch-ua headers
+            pass
+        elif is_safari:
+            # Safari has different sec headers
+            request_headers.update(
+                {
+                    "Sec-Fetch-Site": "same-origin",
+                }
+            )
+    results = []
+    with requests.Session() as session:
+        session.headers.update(request_headers)
+        for url in urls:
+            if url_exists(
+                url,
+                session,
+                request_timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                **kwargs,
+            ):
+                results.append(True)
+            else:
+                # Retry using exponential backoff + some randomness so that we don't get a bunch of
+                # threads or processes all performing queries in a synchronized fashion.
+                # We could have used the built-in Retry strategy
+                # (https://requests.readthedocs.io/en/latest/user/advanced/#example-automatic-retries,
+                # https://www.zenrows.com/blog/python-requests-retry#best-methods-to-retry)
+                # but it isn't more succinct or efficient than what is implemented here.
+                i = 0
+                not_successful = True
+                while i < retry_num and not_successful:
+                    time.sleep(pow(base=2, exp=i) + np.random.random())
+                    if url_exists(
+                        url,
+                        session,
+                        request_timeout=request_timeout,
+                        allow_redirects=allow_redirects,
+                        **kwargs,
+                    ):
+                        results.append(True)
+                        not_successful = False
+                    i += 1
+                if not_successful:
+                    results.append(False)
+
+            # Add a small delay between requests, especially for the same domain
+            if len(urls) > 1:
+                time.sleep(
+                    random.uniform(delay_between_tries[0], delay_between_tries[1])
+                )
+    return results
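
# Illustrative usage sketch (example URLs assumed; requests, time and numpy are assumed to be
# imported earlier in this module): the urls passed here are expected to share a host, as in
# check_urls below, so the shared Session can reuse its connection:
#
#     urls_exist_with_retries(
#         ["https://docs.python.org/3/", "https://docs.python.org/3/library/"],
#         retry_num=1,
#     )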


def check_urls(
@@ -92,30 +199,56 @@ def check_urls(
    retry_num=0,
    request_timeout=10,
    allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    request_headers=None,
    **kwargs,
):
    """
-    Check url existence for a number of urls in a container. It is assumed that the urls
-    are for different hosts (if they are on the same host then better to use a
-    requests.Session object).
+    Check url existence for the urls given in the container.
+
+    This is a multi-threaded approach where each thread uses its own requests.Session
+    object. URLs are grouped by domain, and each group is checked by an independent
+    thread and session so that multiple threads don't try to connect to the same host
+    concurrently. This is a best effort at mimicking browser requests to avoid 403 errors,
+    although those are treated as success anyway. Unfortunately, it still does not pass the
+    scrutiny of some sites (possibly protected against Denial of Service attacks), and they
+    return a 403 error. Timeout exceptions are also considered a success.
+    Finally, some sites return a 404 error even though they exist (https://www.sysy.com/).
+    This may be a server error, or intentional when the site detects that it is being
+    queried by a script/bot.
+
+    Returns a list of boolean values indicating, for each url in the container, whether
+    it exists.
    """
+
+    results_dict = {}
+    # Split the urls by domain to avoid multiple threads concurrently trying to
+    # connect to the same host. Each thread will use its own requests.Session.
+    domain_groups = defaultdict(list)
+    for url in urls_container:
+        try:
+            domain = urlparse(url).netloc
+            domain_groups[domain].append(url)
+        except Exception:
+            # If URL parsing fails, the URL is considered invalid
+            results_dict[url] = False
+
    with ThreadPoolExecutor(num_threads) as executor:
-        return list(
-            executor.map(
-                lambda x: url_exists_with_retries(
-                    url=x,
-                    retry_num=retry_num,
-                    request_timeout=request_timeout,
-                    allow_redirects=allow_redirects,
-                    request_headers=request_headers,
-                    **kwargs,
-                ),
-                urls_container,
-            )
+        dict_values = domain_groups.values()
+        res = executor.map(
+            lambda x: urls_exist_with_retries(
+                urls=x,
+                retry_num=retry_num,
+                request_timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                request_headers=request_headers,
+                **kwargs,
+            ),
+            dict_values,
        )
+        for url_list, result_list in zip(dict_values, res):
+            for url, result in zip(url_list, result_list):
+                results_dict[url] = result
+    return [results_dict[url] for url in urls_container]
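
# Illustrative usage sketch (example values assumed): check_urls preserves the order of the
# input container, so results can be paired back with the urls, e.g.
#
#     urls = ["https://www.python.org", "https://github.com/this-repo-does-not-exist"]
#     for url, exists in zip(urls, check_urls(urls, num_threads=2, retry_num=1)):
#         print(url, exists)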


def duplicated_entries_single_value_columns(df, column_names):