@@ -126,6 +126,54 @@ def lint_xsd(lint_ctx, schema_path, path):
126126 lint_ctx .info ("File validates against XML schema." )
127127
128128
129+ def _validate_doi_url (url , lint_ctx ):
130+ """Validate DOI URL by checking CrossRef API."""
131+ match = re .match ("https?://doi.org/(.*)$" , url )
132+ if match is None :
133+ return False
134+
135+ doi = match .group (1 )
136+ xref_url = f"https://api.crossref.org/works/{ doi } "
137+ try :
138+ requests .get (xref_url , timeout = 5 )
139+ return True
140+ except Exception as e :
141+ lint_ctx .error (f"Error '{ e } ' accessing { url } " )
142+ return False
143+
144+
def _validate_http_url(url, lint_ctx, user_agent=None):
    """Check that an HTTP/HTTPS URL can be fetched.

    Args:
        url: The ``http://`` or ``https://`` URL to request.
        lint_ctx: Lint context; ``error`` is called when the URL is rejected.
        user_agent: Optional ``User-Agent`` header value. When given it is
            sent together with ``Accept: */*``; otherwise no custom headers
            are sent.

    Returns:
        True when the request succeeds and a first chunk of the body can be
        read, or when the server answers 429, or 403/503 with "cloudflare"
        in the response text. False, after reporting the error on
        ``lint_ctx``, in every other case.
    """
    headers = {"User-Agent": user_agent, "Accept": "*/*"} if user_agent else None
    r = None
    try:
        # stream=True: fetch the headers first and read only one chunk of the
        # body instead of downloading the whole resource.
        r = requests.get(url, headers=headers, stream=True)
        r.raise_for_status()
        # A response that yields no content raises StopIteration here and is
        # reported below like any other failure.
        next(r.iter_content(1000))
        return True
    except Exception as e:
        if r is not None and r.status_code == 429:
            # too many requests: the URL exists, we are just being throttled
            return True
        if r is not None and r.status_code in [403, 503] and "cloudflare" in r.text:
            # CloudFlare protection block
            return True
        lint_ctx.error(f"Error '{e}' accessing {url}")
        return False
    finally:
        # A streamed response keeps its connection checked out until it is
        # fully consumed or closed; release it on every exit path.
        if r is not None:
            r.close()
164+
165+
166+ def _validate_other_url (url , lint_ctx ):
167+ """Validate non-HTTP URLs."""
168+ try :
169+ with urlopen (url ) as handle :
170+ handle .read (100 )
171+ return True
172+ except Exception as e :
173+ lint_ctx .error (f"Error '{ e } ' accessing { url } " )
174+ return False
175+
176+
129177def lint_urls (root , lint_ctx ):
130178 """Find referenced URLs and verify they are valid."""
131179 urls , docs = find_urls_for_xml (root )
@@ -134,42 +182,14 @@ def lint_urls(root, lint_ctx):
134182 BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
135183
136184 def validate_url (url , lint_ctx , user_agent = None ):
137- is_valid = True
138- if (match := re .match ("https?://doi.org/(.*)$" , url )) is not None :
139- doi = match .group (1 )
140- xref_url = f"https://api.crossref.org/works/{ doi } "
141- try :
142- requests .get (xref_url , timeout = 5 )
143- except Exception as e :
144- is_valid = False
145- lint_ctx .error (f"Error '{ e } ' accessing { url } " )
185+ is_valid = False
186+ if re .match ("https?://doi.org/(.*)$" , url ):
187+ is_valid = _validate_doi_url (url , lint_ctx )
146188 elif url .startswith ("http://" ) or url .startswith ("https://" ):
147- if user_agent :
148- headers = {"User-Agent" : user_agent , "Accept" : "*/*" }
149- else :
150- headers = None
151- r = None
152- try :
153- r = requests .get (url , headers = headers , stream = True )
154- r .raise_for_status ()
155- next (r .iter_content (1000 ))
156- except Exception as e :
157- if r is not None and r .status_code == 429 :
158- # too many requests
159- pass
160- elif r is not None and r .status_code in [403 , 503 ] and "cloudflare" in r .text :
161- # CloudFlare protection block
162- pass
163- else :
164- is_valid = False
165- lint_ctx .error (f"Error '{ e } ' accessing { url } " )
189+ is_valid = _validate_http_url (url , lint_ctx , user_agent )
166190 else :
167- try :
168- with urlopen (url ) as handle :
169- handle .read (100 )
170- except Exception as e :
171- is_valid = False
172- lint_ctx .error (f"Error '{ e } ' accessing { url } " )
191+ is_valid = _validate_other_url (url , lint_ctx )
192+
173193 if is_valid :
174194 lint_ctx .info ("URL OK %s" % url )
175195
0 commit comments