
Commit 3273da5

fix: exclude incorrect links before checking robots.txt (#1502)
### Description

- Exclude incorrect links before checking `robots.txt`.

### Issues

- Closes: #1499
Parent: 457bbb4
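For context on the failure mode: an href such as `mailto:…` is not absolute in the http(s) sense, yet RFC 3986 joining returns it unchanged because it already carries a scheme, so it used to survive conversion and reach the `robots.txt` check. A minimal sketch of that behavior, using `yarl` (which `convert_to_absolute_url` wraps, per the diff below) and a simplified stand-in for `is_url_absolute`, whose real body is not part of this commit:

```python
from urllib.parse import urlparse

from yarl import URL


def is_url_absolute(url: str) -> bool:
    # Simplified stand-in: treat a URL as absolute only when it has
    # both a scheme and a network location (host).
    parsed = urlparse(url)
    return bool(parsed.scheme) and bool(parsed.netloc)


base = URL('https://example.com/index')

# A relative href joins as expected:
print(base.join(URL('/page_1')))  # https://example.com/page_1

# A 'mailto:' href already has a scheme, so RFC 3986 joining returns it
# unchanged -- and it still has no host, so it is not a usable crawl URL:
joined = base.join(URL('mailto:test@example.com'))
print(joined, is_url_absolute(str(joined)))  # mailto:test@example.com False
```

This is exactly the case the new `if not is_url_absolute(converted_url)` guard in `to_absolute_url_iterator` catches.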

6 files changed: +20 -8 lines changed


src/crawlee/_utils/urls.py

Lines changed: 9 additions & 2 deletions
```diff
@@ -7,6 +7,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 
 
 def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
 
 
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-            yield convert_to_absolute_url(base_url, url)
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 
 
 _http_url_adapter = TypeAdapter(AnyHttpUrl)
```
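A hedged usage sketch of the patched helper; the import path matches the file above, and the sample URLs are illustrative:

```python
import logging

from crawlee._utils.urls import to_absolute_url_iterator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('links')

links = iter(['/sub_index', '/page_1', 'mailto:test@example.com'])
result = list(to_absolute_url_iterator('https://example.com', links, logger=logger))

print(result)
# ['https://example.com/sub_index', 'https://example.com/page_1']
# The 'mailto:' entry is dropped, with a DEBUG message instead of leaking
# a non-crawlable URL downstream.
```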

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -167,7 +167,9 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
 
             links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-            links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+            links_iterator = to_absolute_url_iterator(
+                context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+            )
 
             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
```
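The ordering matters here: `partition` feeds every URL to `robots_txt_file.is_allowed`, so malformed entries such as `mailto:` links must already have been filtered out by `to_absolute_url_iterator`. A small sketch of the split itself, assuming `partition` behaves like `more_itertools.partition` (the first iterator yields items failing the predicate, the second those passing it) and using a stand-in predicate for `is_allowed`:

```python
from more_itertools import partition

links = iter([
    'https://example.com/page_1',
    'https://example.com/private/admin',
])


def is_allowed(url: str) -> bool:
    # Stand-in for robots_txt_file.is_allowed.
    return '/private/' not in url


skipped, allowed = partition(is_allowed, links)
print(list(allowed))  # ['https://example.com/page_1']
print(list(skipped))  # ['https://example.com/private/admin']
```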

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -366,7 +366,9 @@ async def extract_links(
             links_iterator: Iterator[str] = iter(
                 [url for element in elements if (url := await element.get_attribute('href')) is not None]
             )
-            links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+            links_iterator = to_absolute_url_iterator(
+                context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+            )
 
             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
```

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -29,8 +29,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
 
     assert handler.called
 
-    # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    # The handler should find three links
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
```
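For readers unfamiliar with the `handler.call_args[0][0]` indexing in these tests: `call_args[0]` is the tuple of positional arguments from the mock's most recent call, so `[0][0]` is its first positional argument, here the list of links. A minimal illustration with a plain `unittest.mock.Mock`; the three-element list is a stand-in for the links parsed from the updated fixture page:

```python
from unittest.mock import Mock

handler = Mock()
handler(['/sub_index', '/page_1', 'mailto:test@example.com'])  # the crawler invokes the handler

assert handler.called
assert len(handler.call_args[0][0]) == 3  # first positional arg of the last call
```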

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -31,8 +31,8 @@ async def request_handler(context: ParselCrawlingContext) -> None:
 
     assert handler.called
 
-    # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    # The handler should find three links
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
```

tests/unit/server_endpoints.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -14,6 +14,7 @@
 <body>
     <a href="/sub_index" class="foo">Link 1</a>
     <a href="/page_1">Link 2</a>
+    <a href="mailto:test@example.com">Email</a>
 </body></html>"""
 
 SECONDARY_INDEX = b"""\
```
