from concurrent.futures import ThreadPoolExecutor
import sys
import pandas as pd
+import random
+from urllib.parse import urlparse
+from collections import defaultdict


def url_exists(
    url,
+    session,
    request_timeout=2,
    allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    delay_between_tries=[0.1, 0.3],
    **kwargs,
):
-    """
-    Check if a url exists. All parameters supported by requests.head are available. Those not explicitly
-    listed are forwarded as is to requests.head. Timeout exceptions are considered a success. Also,
-    the 403 "forbidden" code (server understands the request but refuses to authorize it) is considered
-    success too.
-    """
    try:
-        # Using requests.head and not requests.get because
-        # we don't need the webpage content.
-        res = requests.head(
+        # Using head and not get because we don't need the webpage
+        # content, so it is also more eco-friendly.
+        res = session.head(
            url,
            timeout=request_timeout,
-            headers=request_headers,
            allow_redirects=allow_redirects,
            **kwargs,
        )
+        # head resulted in a 405 Method Not Allowed, try get after a short delay
+        if res.status_code == 405:
+            time.sleep(random.uniform(delay_between_tries[0], delay_between_tries[1]))
+            res = session.get(
+                url,
+                timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                stream=True,  # don't download the full content
+                **kwargs,
+            )
+            res.close()  # close the connection immediately
        if res.status_code == 403:
            print(f"{url}: 403 forbidden, server refused to authorize the request, considered existing.")
-        # HTTP 200 status code for success
-        return res.status_code in [200, 403]
+        # HTTP 200 for success, 30x redirects and 403 forbidden. We consider
+        # all of these responses as indicating that the URL exists.
+        return res.status_code in [200, 301, 302, 303, 307, 308, 403]
    except requests.exceptions.Timeout:
        print(f"{url}: timed out after {request_timeout} seconds, considered existing.")
        return True
+    except requests.exceptions.SSLError:
+        print(f"{url}: SSL error, considered existing.")
+        return True
    except requests.exceptions.RequestException as e:
        print(f"{url}: request failed, {e}")
        return False
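A minimal usage sketch of the reworked url_exists (illustrative only; the URL and header value are placeholders, and requests is assumed to be imported at the top of the module):

    import requests

    with requests.Session() as session:
        # The caller is responsible for putting browser-like headers on the session.
        session.headers.update({"User-Agent": "Mozilla/5.0"})
        if url_exists("https://example.com", session, request_timeout=5):
            print("URL appears to exist")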


-def url_exists_with_retries(
-    url,
+def urls_exist_with_retries(
+    urls,
    retry_num=0,
+    delay_between_tries=[0.1, 0.3],
    request_timeout=2,
    allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    request_headers=None,
    **kwargs,
):
-    if url_exists(url, request_timeout, allow_redirects, request_headers, **kwargs):
-        return True
-    # Retry using exponential backoff + some randomness so that we don't get a bunch of
-    # threads or processes all performing queries in a synchronized fashion.
-    # We could have used the Retry built in strategy
-    # (https://requests.readthedocs.io/en/latest/user/advanced/#example-automatic-retries,
-    # https://www.zenrows.com/blog/python-requests-retry#best-methods-to-retry)
-    # but it isn't more succinct or efficient than what is implemented here.
-    for i in range(retry_num):
-        time.sleep(pow(base=2, exp=i) + np.random.random())
-        if url_exists(url, request_timeout, allow_redirects, request_headers, **kwargs):
-            return True
-    return False
+    """
+    Check if a set of urls exist. All parameters supported by requests.head are available. Those not explicitly
+    listed are forwarded as is to requests.head. Timeout exceptions are considered a success. Also,
+    the 403 "forbidden" code (server understands the request but refuses to authorize it) is considered
+    success too.
+    """
+    # Use a variety of user agents to reduce the chance of being blocked with a 403 response.
+    user_agents = [
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # noqa: E501
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # noqa: E501
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",  # noqa: E501
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",  # noqa: E501
+    ]
+
+    if request_headers is None:
+        # Create comprehensive browser-like headers to avoid 403 errors
+        selected_user_agent = random.choice(user_agents)
+
+        # Determine browser type from user agent for consistent headers
+        is_chrome = "Chrome" in selected_user_agent and "Edg" not in selected_user_agent
+        is_firefox = "Firefox" in selected_user_agent
+        is_safari = (
+            "Safari" in selected_user_agent and "Chrome" not in selected_user_agent
+        )
+        is_edge = "Edg" in selected_user_agent
+
+        # Base headers that work for most sites
+        request_headers = {
+            "User-Agent": selected_user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",  # noqa: E501
+            "Accept-Language": "en-US,en;q=0.9",
+            "Accept-Encoding": "gzip, deflate, br",
+            "DNT": "1",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+            "Cache-Control": "max-age=0",
+        }
+
+        # Browser-specific headers for maximum authenticity
+        if is_chrome or is_edge:
+            request_headers.update(
+                {
+                    "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    "sec-ch-ua-mobile": "?0",
+                    "sec-ch-ua-platform": (
+                        '"Windows"'
+                        if "Windows" in selected_user_agent
+                        else '"macOS"' if "Mac" in selected_user_agent else '"Linux"'
+                    ),
+                }
+            )
+        elif is_firefox:
+            # Firefox doesn't send sec-ch-ua headers
+            pass
+        elif is_safari:
+            # Safari has different sec headers
+            request_headers.update(
+                {
+                    "Sec-Fetch-Site": "same-origin",
+                }
+            )
+    results = []
+    with requests.Session() as session:
+        session.headers.update(request_headers)
+        for url in urls:
+            if url_exists(
+                url,
+                session,
+                request_timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                **kwargs,
+            ):
+                results.append(True)
+            else:
+                # Retry using exponential backoff + some randomness so that we don't get a bunch of
+                # threads or processes all performing queries in a synchronized fashion.
+                # We could have used the built-in Retry strategy
+                # (https://requests.readthedocs.io/en/latest/user/advanced/#example-automatic-retries,
+                # https://www.zenrows.com/blog/python-requests-retry#best-methods-to-retry)
+                # but it isn't more succinct or efficient than what is implemented here.
+                i = 0
+                not_successful = True
+                while i < retry_num and not_successful:
+                    time.sleep(pow(base=2, exp=i) + np.random.random())
+                    if url_exists(
+                        url,
+                        session,
+                        request_timeout=request_timeout,
+                        allow_redirects=allow_redirects,
+                        **kwargs,
+                    ):
+                        results.append(True)
+                        not_successful = False
+                    i += 1
+                if not_successful:
+                    results.append(False)
+
+            # Add a small delay between requests, especially for the same domain
+            if len(urls) > 1:
+                time.sleep(
+                    random.uniform(delay_between_tries[0], delay_between_tries[1])
+                )
+    return results
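As an illustration only (the host and paths are placeholders), urls_exist_with_retries can also be called directly on a small batch of same-domain URLs; with retry_num=2, a URL that initially fails is retried after roughly 2**i seconds (1s, then 2s) plus random jitter:

    same_domain_urls = [
        "https://example.org/page1",
        "https://example.org/page2",
    ]
    exists = urls_exist_with_retries(same_domain_urls, retry_num=2, request_timeout=5)
    print(dict(zip(same_domain_urls, exists)))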


def check_urls(
@@ -92,30 +199,56 @@ def check_urls(
    retry_num=0,
    request_timeout=10,
    allow_redirects=True,
-    request_headers={
-        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"  # noqa E501
-    },
+    request_headers=None,
    **kwargs,
):
    """
-    Check url existence for a number of urls in a container. It is assumed that the urls
-    are for different hosts (if they are on the same host then better to use a
-    requests.Session object).
+    Check url existence for the urls given in the container.
+
+    This is a multi-threaded approach in which each thread uses its own requests.Session
+    object. URLs are grouped by domain, and each group is checked by an independent
+    thread and session so that multiple threads don't try to connect to the same host concurrently.
+    The request headers mimic a browser as a best effort to avoid 403 errors, though 403 responses
+    are still counted as success. Unfortunately, this does not pass the scrutiny of some sites
+    (possibly protected against Denial of Service attacks), which return a 403 error anyway.
+    Timeout exceptions are also considered a success.
+    Finally, some sites return a 404 error even though they exist (https://www.sysy.com/). This
+    may be a server error, or intentional when the site detects that it is being queried by
+    a script/bot.
+
+    Returns a list of boolean values indicating, for each url in the container, whether it
+    exists or not.
    """
+
+    results_dict = {}
+    # Split the urls by domain to avoid multiple threads concurrently trying to
+    # connect to the same host. Each thread will use its own requests.Session.
+    domain_groups = defaultdict(list)
+    for url in urls_container:
+        try:
+            domain = urlparse(url).netloc
+            domain_groups[domain].append(url)
+        except Exception:
+            # If URL parsing fails, then the URL is incorrect
+            results_dict[url] = False
+
    with ThreadPoolExecutor(num_threads) as executor:
-        return list(
-            executor.map(
-                lambda x: url_exists_with_retries(
-                    url=x,
-                    retry_num=retry_num,
-                    request_timeout=request_timeout,
-                    allow_redirects=allow_redirects,
-                    request_headers=request_headers,
-                    **kwargs,
-                ),
-                urls_container,
-            )
+        dict_values = domain_groups.values()
+        res = executor.map(
+            lambda x: urls_exist_with_retries(
+                urls=x,
+                retry_num=retry_num,
+                request_timeout=request_timeout,
+                allow_redirects=allow_redirects,
+                request_headers=request_headers,
+                **kwargs,
+            ),
+            dict_values,
        )
+        for url_list, result_list in zip(dict_values, res):
+            for url, result in zip(url_list, result_list):
+                results_dict[url] = result
+    return [results_dict[url] for url in urls_container]
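A rough usage sketch for check_urls (all hostnames are placeholders, and num_threads is assumed to be among the parameters elided from this hunk); URLs are grouped by domain, each group is checked in its own thread and session, and the results come back in the original input order:

    urls = [
        "https://example.com",
        "https://example.org/docs",
        "https://example.net/missing-page",
    ]
    results = check_urls(urls, num_threads=2, retry_num=1, request_timeout=5)
    for url, ok in zip(urls, results):
        print(f"{url}: {'exists' if ok else 'not found'}")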


def duplicated_entries_single_value_columns(df, column_names):