Skip to content

Commit 92f0081

Browse files
committed
Reduce function complexity, verify content can be read without fetching whole response
1 parent 4570e7f commit 92f0081

File tree

1 file changed

+54
-34
lines changed

1 file changed

+54
-34
lines changed

planemo/lint.py

Lines changed: 54 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,54 @@ def lint_xsd(lint_ctx, schema_path, path):
126126
lint_ctx.info("File validates against XML schema.")
127127

128128

129+
def _validate_doi_url(url, lint_ctx):
130+
"""Validate DOI URL by checking CrossRef API."""
131+
match = re.match("https?://doi.org/(.*)$", url)
132+
if match is None:
133+
return False
134+
135+
doi = match.group(1)
136+
xref_url = f"https://api.crossref.org/works/{doi}"
137+
try:
138+
requests.get(xref_url, timeout=5)
139+
return True
140+
except Exception as e:
141+
lint_ctx.error(f"Error '{e}' accessing {url}")
142+
return False
143+
144+
145+
def _validate_http_url(url, lint_ctx, user_agent=None):
146+
"""Validate HTTP/HTTPS URL."""
147+
headers = {"User-Agent": user_agent, "Accept": "*/*"} if user_agent else None
148+
r = None
149+
try:
150+
r = requests.get(url, headers=headers, stream=True)
151+
r.raise_for_status()
152+
next(r.iter_content(1000))
153+
return True
154+
except Exception as e:
155+
if r is not None and r.status_code == 429:
156+
# too many requests
157+
return True
158+
elif r is not None and r.status_code in [403, 503] and "cloudflare" in r.text:
159+
# CloudFlare protection block
160+
return True
161+
else:
162+
lint_ctx.error(f"Error '{e}' accessing {url}")
163+
return False
164+
165+
166+
def _validate_other_url(url, lint_ctx):
167+
"""Validate non-HTTP URLs."""
168+
try:
169+
with urlopen(url) as handle:
170+
handle.read(100)
171+
return True
172+
except Exception as e:
173+
lint_ctx.error(f"Error '{e}' accessing {url}")
174+
return False
175+
176+
129177
def lint_urls(root, lint_ctx):
130178
"""Find referenced URLs and verify they are valid."""
131179
urls, docs = find_urls_for_xml(root)
@@ -134,42 +182,14 @@ def lint_urls(root, lint_ctx):
134182
BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
135183

136184
def validate_url(url, lint_ctx, user_agent=None):
137-
is_valid = True
138-
if (match := re.match("https?://doi.org/(.*)$", url)) is not None:
139-
doi = match.group(1)
140-
xref_url = f"https://api.crossref.org/works/{doi}"
141-
try:
142-
requests.get(xref_url, timeout=5)
143-
except Exception as e:
144-
is_valid = False
145-
lint_ctx.error(f"Error '{e}' accessing {url}")
185+
is_valid = False
186+
if re.match("https?://doi.org/(.*)$", url):
187+
is_valid = _validate_doi_url(url, lint_ctx)
146188
elif url.startswith("http://") or url.startswith("https://"):
147-
if user_agent:
148-
headers = {"User-Agent": user_agent, "Accept": "*/*"}
149-
else:
150-
headers = None
151-
r = None
152-
try:
153-
r = requests.get(url, headers=headers, stream=True)
154-
r.raise_for_status()
155-
next(r.iter_content(1000))
156-
except Exception as e:
157-
if r is not None and r.status_code == 429:
158-
# too many requests
159-
pass
160-
elif r is not None and r.status_code in [403, 503] and "cloudflare" in r.text:
161-
# CloudFlare protection block
162-
pass
163-
else:
164-
is_valid = False
165-
lint_ctx.error(f"Error '{e}' accessing {url}")
189+
is_valid = _validate_http_url(url, lint_ctx, user_agent)
166190
else:
167-
try:
168-
with urlopen(url) as handle:
169-
handle.read(100)
170-
except Exception as e:
171-
is_valid = False
172-
lint_ctx.error(f"Error '{e}' accessing {url}")
191+
is_valid = _validate_other_url(url, lint_ctx)
192+
173193
if is_valid:
174194
lint_ctx.info("URL OK %s" % url)
175195

0 commit comments

Comments
 (0)