Skip to content

Commit

Permalink
docs: Refactor HttpCrawler to conform to Google style docstring guidelines
Browse files Browse the repository at this point in the history
  • Loading branch information
belloibrahv committed Oct 22, 2024
1 parent 36d0f59 commit f9524da
Showing 1 changed file with 43 additions and 1 deletion.
44 changes: 43 additions & 1 deletion src/crawlee/http_crawler/_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,29 @@


class HttpCrawler(BasicCrawler[HttpCrawlingContext]):
"""A crawler that fetches the request URL using `httpx`."""
"""A crawler that fetches the request URL using `httpx`.
The `HttpCrawler` class extends `BasicCrawler` to perform web crawling tasks that involve HTTP requests.
It uses the `httpx` library for handling HTTP-based operations, supporting configurable error handling
and session management. The crawler can manage additional error status codes to trigger retries
and exclude specific codes that are generally treated as errors.
Usage:
```python
from crawlee.http_crawler import HttpCrawler
# Instantiate and configure the HttpCrawler
crawler = HttpCrawler(
additional_http_error_status_codes=[500, 502],
ignore_http_error_status_codes=[404],
max_request_retries=3,
request_timeout_secs=30,
)
# Run the crawler to start fetching URLs
await crawler.run()
```
"""

def __init__(
self,
Expand Down Expand Up @@ -51,6 +73,15 @@ def __init__(
super().__init__(**kwargs)

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Executes an HTTP request using the `httpx` client with the provided context parameters.
Args:
context: The crawling context containing request, session,
and other relevant parameters for the HTTP request.
Yields:
The context object, updated with the HTTP response details.
"""
result = await self._http_client.crawl(
request=context.request,
session=context.session,
Expand All @@ -70,6 +101,17 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
)

async def _handle_blocked_request(self, context: HttpCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Handles blocked requests by checking the HTTP status code and managing session behavior.
If a blocked status code is detected and the retry option is enabled,
the session is flagged as blocked to trigger a retry mechanism.
Args:
context: The crawling context containing the HTTP response and session information.
Yields:
The same context if no errors are detected, otherwise raises a `SessionError` to indicate a blocked session.
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code

Expand Down

0 comments on commit f9524da

Please sign in to comment.