
Commit 3273da5

fix: exclude incorrect links before checking robots.txt (#1502)
### Description

- Exclude incorrect links before checking `robots.txt`.

### Issues

- Closes: #1499
Parent: 457bbb4
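For context on the failure mode: an href such as `mailto:…` is not absolute in the http(s) sense, yet RFC 3986 joining returns it unchanged because it already carries a scheme, so it used to survive conversion and reach the `robots.txt` check. A minimal sketch of that behavior, using `yarl` (which `convert_to_absolute_url` wraps, per the diff below) and a simplified stand-in for `is_url_absolute`, whose real body is not part of this commit:

```python
from urllib.parse import urlparse

from yarl import URL


def is_url_absolute(url: str) -> bool:
    # Simplified stand-in: treat a URL as absolute only when it has
    # both a scheme and a network location (host).
    parsed = urlparse(url)
    return bool(parsed.scheme) and bool(parsed.netloc)


base = URL('https://example.com/index')

# A relative href joins as expected:
print(base.join(URL('/page_1')))  # https://example.com/page_1

# A 'mailto:' href already has a scheme, so RFC 3986 joining returns it
# unchanged -- and it still has no host, so it is not a usable crawl URL:
joined = base.join(URL('mailto:test@example.com'))
print(joined, is_url_absolute(str(joined)))  # mailto:test@example.com False
```

This is exactly the case the new `if not is_url_absolute(converted_url)` guard in `to_absolute_url_iterator` catches.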

6 files changed: +20 -8 lines changed


src/crawlee/_utils/urls.py

Lines changed: 9 additions & 2 deletions
```diff
@@ -7,6 +7,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Iterator
+    from logging import Logger
 
 
 def is_url_absolute(url: str) -> bool:
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
     return str(URL(base_url).join(URL(relative_url)))
 
 
-def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
+def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
     """Convert an iterator of relative URLs to absolute URLs using a base URL."""
     for url in urls:
         if is_url_absolute(url):
             yield url
         else:
-            yield convert_to_absolute_url(base_url, url)
+            converted_url = convert_to_absolute_url(base_url, url)
+            # Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
+            if not is_url_absolute(converted_url):
+                if logger:
+                    logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
+                continue
+            yield converted_url
 
 
 _http_url_adapter = TypeAdapter(AnyHttpUrl)
```
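A hedged usage sketch of the patched helper; the import path matches the file above, and the sample URLs are illustrative:

```python
import logging

from crawlee._utils.urls import to_absolute_url_iterator

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('links')

links = iter(['/sub_index', '/page_1', 'mailto:test@example.com'])
result = list(to_absolute_url_iterator('https://example.com', links, logger=logger))

print(result)
# ['https://example.com/sub_index', 'https://example.com/page_1']
# The 'mailto:' entry is dropped, with a DEBUG message instead of leaking
# a non-crawlable URL downstream.
```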

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -167,7 +167,9 @@ async def extract_links(
             kwargs.setdefault('strategy', 'same-hostname')
 
             links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-            links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+            links_iterator = to_absolute_url_iterator(
+                context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+            )
 
             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
```
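The ordering matters here: `partition` feeds every URL to `robots_txt_file.is_allowed`, so malformed entries such as `mailto:` links must already have been filtered out by `to_absolute_url_iterator`. A small sketch of the split itself, assuming `partition` behaves like `more_itertools.partition` (the first iterator yields items failing the predicate, the second those passing it) and using a stand-in predicate for `is_allowed`:

```python
from more_itertools import partition

links = iter([
    'https://example.com/page_1',
    'https://example.com/private/admin',
])


def is_allowed(url: str) -> bool:
    # Stand-in for robots_txt_file.is_allowed.
    return '/private/' not in url


skipped, allowed = partition(is_allowed, links)
print(list(allowed))  # ['https://example.com/page_1']
print(list(skipped))  # ['https://example.com/private/admin']
```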

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -366,7 +366,9 @@ async def extract_links(
             links_iterator: Iterator[str] = iter(
                 [url for element in elements if (url := await element.get_attribute('href')) is not None]
             )
-            links_iterator = to_absolute_url_iterator(context.request.loaded_url or context.request.url, links_iterator)
+            links_iterator = to_absolute_url_iterator(
+                context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+            )
 
             if robots_txt_file:
                 skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
```

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -29,8 +29,8 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
 
     assert handler.called
 
-    # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    # The handler should find three links
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
```
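For readers unfamiliar with the `handler.call_args[0][0]` indexing in these tests: `call_args[0]` is the tuple of positional arguments from the mock's most recent call, so `[0][0]` is its first positional argument, here the list of links. A minimal illustration with a plain `unittest.mock.Mock`; the three-element list is a stand-in for the links parsed from the updated fixture page:

```python
from unittest.mock import Mock

handler = Mock()
handler(['/sub_index', '/page_1', 'mailto:test@example.com'])  # the crawler invokes the handler

assert handler.called
assert len(handler.call_args[0][0]) == 3  # first positional arg of the last call
```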

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -31,8 +31,8 @@ async def request_handler(context: ParselCrawlingContext) -> None:
 
     assert handler.called
 
-    # The handler should find two links
-    assert len(handler.call_args[0][0]) == 2
+    # The handler should find three links
+    assert len(handler.call_args[0][0]) == 3
 
 
 async def test_enqueue_links(redirect_server_url: URL, server_url: URL, http_client: HttpClient) -> None:
```

tests/unit/server_endpoints.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -14,6 +14,7 @@
 <body>
     <a href="/sub_index" class="foo">Link 1</a>
     <a href="/page_1">Link 2</a>
+    <a href="mailto:test@example.com">Email</a>
 </body></html>"""
 
 SECONDARY_INDEX = b"""\
```
