Skip to content

Commit

Permalink
docs: Refactor HttpCrawler to conform to Google style docstring guidelines
Browse files Browse the repository at this point in the history
  • Loading branch information
belloibrahv committed Oct 22, 2024
1 parent 36d0f59 commit f9524da
Showing 1 changed file with 43 additions and 1 deletion.
44 changes: 43 additions & 1 deletion src/crawlee/http_crawler/_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,29 @@


class HttpCrawler(BasicCrawler[HttpCrawlingContext]):
"""A crawler that fetches the request URL using `httpx`."""
"""A crawler that fetches the request URL using `httpx`.
The `HttpCrawler` class extends `BasicCrawler` to perform web crawling tasks that involve HTTP requests.
It uses the `httpx` library for handling HTTP-based operations, supporting configurable error handling
and session management. The crawler can manage additional error status codes to trigger retries
and exclude specific codes that are generally treated as errors.
Usage:
```python
from crawlee.http_crawler import HttpCrawler
# Instantiate and configure the HttpCrawler
crawler = HttpCrawler(
additional_http_error_status_codes=[500, 502],
ignore_http_error_status_codes=[404],
max_request_retries=3,
request_timeout_secs=30,
)
# Run the crawler to start fetching URLs
await crawler.run()
```
"""

def __init__(
self,
Expand Down Expand Up @@ -51,6 +73,15 @@ def __init__(
super().__init__(**kwargs)

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Executes an HTTP request using the `httpx` client with the provided context parameters.
Args:
context: The crawling context containing request, session,
and other relevant parameters for the HTTP request.
Yields:
The context object, updated with the HTTP response details.
"""
result = await self._http_client.crawl(
request=context.request,
session=context.session,
Expand All @@ -70,6 +101,17 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
)

async def _handle_blocked_request(self, context: HttpCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Handles blocked requests by checking the HTTP status code and managing session behavior.
If a blocked status code is detected and the retry option is enabled,
the session is flagged as blocked to trigger a retry mechanism.
Args:
context: The crawling context containing the HTTP response and session information.
Yields:
The same context if no errors are detected, otherwise raises a `SessionError` to indicate a blocked session.
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code

Expand Down

0 comments on commit f9524da

Please sign in to comment.