docs: Update http crawler docs #612

Closed
135 changes: 87 additions & 48 deletions src/crawlee/basic_crawler/_basic_crawler.py
@@ -88,14 +88,56 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):


class BasicCrawler(Generic[TCrawlingContext]):
"""Provides a simple framework for parallel crawling of web pages.

The URLs to crawl are fed either from a static list of URLs or from a dynamic queue of URLs enabling recursive
crawling of websites.

`BasicCrawler` is a low-level tool that requires the user to implement the page download and data extraction
functionality themselves. If we want a crawler that already facilitates this functionality, we should consider using
one of its subclasses.
"""A versatile web crawler for parallel URL fetching with extensive features for web scraping.

BasicCrawler is a highly customizable and efficient web crawling solution that provides:
- Automatic scaling based on available system resources
- Smart request routing and handling of different types of URLs
- Session management for maintaining persistent connections
- Proxy integration and rotation for avoiding IP-based blocking
- Automatic retries for failed requests with customizable settings
- Built-in statistics tracking for monitoring crawler performance
- Event system for fine-grained control over the crawling process

Basic usage:
```python
async def handle_request(context: BasicCrawlingContext) -> None:
    url = context.request.url
    response = await context.send_request(url)
    # Process the response here


crawler = BasicCrawler(
    request_handler=handle_request,
)
await crawler.run(['https://crawlee.dev'])
```

Advanced features:
- Configure concurrency to control parallel processing
- Set up automatic retry mechanisms for failed or blocked requests
- Utilize session pools for efficient connection management
- Integrate proxy services for IP rotation
- Monitor detailed statistics about the crawling process
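
A sketch of combining these options; the import paths, the `ConcurrencySettings` arguments, and the `proxy_urls` argument are assumptions rather than part of this change:
```python
from crawlee import ConcurrencySettings
from crawlee.proxy_configuration import ProxyConfiguration

# Hypothetical configuration combining several of the options listed above.
crawler = BasicCrawler(
    request_handler=handle_request,
    concurrency_settings=ConcurrencySettings(min_concurrency=2, max_concurrency=10),
    max_request_retries=5,
    retry_on_blocked=True,
    proxy_configuration=ProxyConfiguration(proxy_urls=['http://proxy.example.com:8000']),
)
```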

Args:
request_provider: Provides the requests to be crawled.
request_handler: The callable to which handling of individual requests is delegated.
http_client: The HTTP client used for `BasicCrawlingContext.send_request` and HTTP-only crawling.
concurrency_settings: Settings for fine-tuning concurrency levels.
max_request_retries: Maximum number of attempts at processing a failed request. Defaults to 3.
max_requests_per_crawl: Maximum number of pages the crawler will open; `None` (the default) means no limit.
max_session_rotations: Maximum number of session rotations per request. Defaults to 10.
configuration: Crawler configuration.
request_handler_timeout: How long a single request handler is allowed to run. Defaults to 1 minute.
use_session_pool: Whether to use a session pool for crawling. Defaults to `True`.
session_pool: A preconfigured `SessionPool` instance, for non-default configuration.
retry_on_blocked: Whether to automatically retry requests blocked by bot protection. Defaults to `True`.
proxy_configuration: HTTP proxy configuration used when making requests.
statistics: A preconfigured `Statistics` instance, for non-default configuration.
event_manager: A custom `EventManager` instance, if a non-default one should be used.
configure_logging: Whether the crawler should configure the logging infrastructure. Defaults to `True`.
_context_pipeline: Internal context pipeline, meant to be used by subclasses rather than direct callers.
_additional_context_managers: Additional context managers used in the crawler's lifecycle.
_logger: A logger instance passed from a subclass to ensure consistent log labels.
"""

def __init__(
@@ -121,34 +163,28 @@ def __init__(
_additional_context_managers: Sequence[AsyncContextManager] | None = None,
_logger: logging.Logger | None = None,
) -> None:
"""Initialize the BasicCrawler.
"""Initializes the BasicCrawler.

Args:
request_provider: Provides requests to be processed
request_handler: A callable to which request handling is delegated
http_client: HTTP client to be used for `BasicCrawlingContext.send_request` and HTTP-only crawling.
concurrency_settings: Allows fine-tuning concurrency levels
max_request_retries: Maximum amount of attempts at processing a request
max_requests_per_crawl: Maximum number of pages that the crawler will open. The crawl will stop when
the limit is reached. It is recommended to set this value in order to prevent infinite loops in
misconfigured crawlers. None means no limit. Due to concurrency_settings, the actual number of pages
visited may slightly exceed this value.
max_session_rotations: Maximum number of session rotations per request.
The crawler will automatically rotate the session in case of a proxy error or if it gets blocked by
the website.
configuration: Crawler configuration
request_handler_timeout: How long is a single request handler allowed to run
use_session_pool: Enables using the session pool for crawling
session_pool: A preconfigured `SessionPool` instance if you wish to use non-default configuration
retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
proxy_configuration: A HTTP proxy configuration to be used for making requests
statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
event_manager: A custom `EventManager` instance if you wish to use a non-default one
configure_logging: If set to True, the crawler will configure the logging infrastructure
_context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
_additional_context_managers: Additional context managers to be used in the crawler lifecycle.
_logger: A logger instance passed from a child class to ensure consistent labels
request_provider: Provides the requests to be crawled.
request_handler: The callable to which request handling is delegated.
http_client: The HTTP client used for `BasicCrawlingContext.send_request` and HTTP-only crawling.
concurrency_settings: Settings for fine-tuning concurrency levels.
max_request_retries: Maximum number of attempts at processing a request. Defaults to 3.
max_requests_per_crawl: Maximum number of pages the crawler will open; `None` (the default) means no limit.
max_session_rotations: Maximum number of session rotations per request. Defaults to 10.
configuration: Crawler configuration.
request_handler_timeout: How long a single request handler is allowed to run. Defaults to 1 minute.
use_session_pool: Whether to use the session pool for crawling. Defaults to `True`.
session_pool: A preconfigured `SessionPool` instance, for non-default configuration.
retry_on_blocked: Whether to automatically retry requests blocked by bot protection. Defaults to `True`.
proxy_configuration: HTTP proxy configuration used for making requests.
statistics: A preconfigured `Statistics` instance, for non-default configuration.
event_manager: A custom `EventManager` instance, if a non-default one should be used.
configure_logging: Whether the crawler should configure the logging infrastructure. Defaults to `True`.
_context_pipeline: Internal context pipeline, meant to be used by subclasses rather than direct callers.
_additional_context_managers: Additional context managers used in the crawler's lifecycle.
_logger: A logger instance passed from a subclass to ensure consistent log labels.
"""
self._router: Router[TCrawlingContext] | None = None

@@ -230,39 +266,40 @@ def __init__(

@property
def log(self) -> logging.Logger:
"""The logger used by the crawler."""
"""The logger used by the crawler for debugging and info-level messages."""
return self._logger

@property
def router(self) -> Router[TCrawlingContext]:
"""The router used to handle each individual crawling request."""
"""Handles individual crawling requests. Initializes a router if none is set."""
if self._router is None:
self._router = Router[TCrawlingContext]()

return self._router

@router.setter
def router(self, router: Router[TCrawlingContext]) -> None:
"""Sets the router, ensuring only one instance is allowed."""
if self._router is not None:
raise RuntimeError('A router is already set')

self._router = router
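
For illustration, a sketch of registering handlers through the router; the `default_handler` and label-based `handler` decorators are assumed from the `Router` API, and `crawler` is a hypothetical instance:
```python
@crawler.router.default_handler
async def default_handler(context: BasicCrawlingContext) -> None:
    context.log.info(f'Processing {context.request.url}')


@crawler.router.handler('DETAIL')
async def detail_handler(context: BasicCrawlingContext) -> None:
    # Requests enqueued with label='DETAIL' are routed here.
    await context.push_data({'url': context.request.url})
```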

@property
def statistics(self) -> Statistics[StatisticsState]:
"""Statistics about the current (or last) crawler run."""
"""Returns statistics about the crawler's current or last run."""
return self._statistics

@property
def _max_requests_count_exceeded(self) -> bool:
"""Whether the maximum number of requests to crawl has been reached."""
"""Checks if the max number of crawl requests has been reached."""
if self._max_requests_per_crawl is None:
return False

return self._statistics.state.requests_finished >= self._max_requests_per_crawl

async def _get_session(self) -> Session | None:
"""If session pool is being used, try to take a session from it."""
"""Fetches a session from the pool if session pooling is enabled."""
if not self._use_session_pool:
return None

@@ -276,7 +313,7 @@ async def _get_session(self) -> Session | None:
)

async def _get_proxy_info(self, request: Request, session: Session | None) -> ProxyInfo | None:
"""Retrieve a new ProxyInfo object based on crawler configuration and the current request and session."""
"""Returns a ProxyInfo object for a request based on the crawlers proxy settings."""
if not self._proxy_configuration:
return None

@@ -292,7 +329,7 @@ async def get_request_provider(
id: str | None = None,
name: str | None = None,
) -> RequestProvider:
"""Return the configured request provider. If none is configured, open and return the default request queue."""
"""Returns the configured request provider or opens the default request queue."""
if not self._request_provider:
self._request_provider = await RequestQueue.open(id=id, name=name, configuration=self._configuration)

@@ -304,7 +341,7 @@ async def get_dataset(
id: str | None = None,
name: str | None = None,
) -> Dataset:
"""Return the dataset with the given ID or name. If none is provided, return the default dataset."""
"""Opens and returns the dataset by ID or name, or opens the default dataset if none is provided."""
return await Dataset.open(id=id, name=name, configuration=self._configuration)

async def get_key_value_store(
@@ -313,20 +350,20 @@ async def get_key_value_store(
id: str | None = None,
name: str | None = None,
) -> KeyValueStore:
"""Return the key-value store with the given ID or name. If none is provided, return the default KVS."""
"""Opens and returns a key-value store by ID or name, or opens the default one if none is provided."""
return await KeyValueStore.open(id=id, name=name, configuration=self._configuration)
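
A brief sketch of using these storage accessors; the keys and values are illustrative only:
```python
dataset = await crawler.get_dataset()
await dataset.push_data({'url': 'https://crawlee.dev', 'title': 'Crawlee'})

kvs = await crawler.get_key_value_store()
await kvs.set_value('last-run', {'status': 'ok'})
```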

def error_handler(
self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
) -> ErrorHandler[TCrawlingContext]:
"""Decorator for configuring an error handler (called after a request handler error and before retrying)."""
"""Sets a custom error handler for request retries in case of errors."""
self._error_handler = handler
return handler

def failed_request_handler(
self, handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext]
) -> FailedRequestHandler[TCrawlingContext]:
"""Decorator for configuring a failed request handler (called after max retries are reached)."""
"""Sets a handler for requests that fail after reaching max retries."""
self._failed_request_handler = handler
return handler
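
A sketch of how the two decorators might be used together; the handler signatures (context plus exception) are assumed from the `ErrorHandler` and `FailedRequestHandler` types:
```python
@crawler.error_handler
async def retry_hook(context: BasicCrawlingContext, error: Exception) -> None:
    # Invoked after a request handler error, before the request is retried.
    context.log.warning(f'Retrying {context.request.url} after error: {error}')


@crawler.failed_request_handler
async def failure_hook(context: BasicCrawlingContext, error: Exception) -> None:
    # Invoked once the request has exhausted all retries.
    context.log.error(f'Giving up on {context.request.url}: {error}')
```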

@@ -339,9 +376,11 @@ async def run(
"""Run the crawler until all requests are processed.

Args:
requests: The requests to be enqueued before the crawler starts
purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default
request queue will be purged
requests: A list of requests to enqueue before the crawl starts.
purge_request_queue: Whether to clear the request queue before starting the crawler run.

Returns:
The final statistics about the crawler run.
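
For example (the URL and the statistics field name are placeholders, not taken from this change):
```python
stats = await crawler.run(
    requests=['https://crawlee.dev'],
    purge_request_queue=True,  # start the run from an empty default request queue
)
crawler.log.info(f'Finished requests: {stats.requests_finished}')
```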
"""
if self._running:
raise RuntimeError(
44 changes: 43 additions & 1 deletion src/crawlee/http_crawler/_http_crawler.py
@@ -15,7 +15,29 @@


class HttpCrawler(BasicCrawler[HttpCrawlingContext]):
"""A crawler that fetches the request URL using `httpx`."""
"""A crawler that fetches the request URL using `httpx`.

The `HttpCrawler` class extends `BasicCrawler` to perform web crawling tasks that involve HTTP requests.
It uses the `httpx` library for handling HTTP-based operations, supporting configurable error handling
and session management. Additional HTTP status codes can be configured to be treated as errors (triggering
retries), and specific status codes that would normally be treated as errors can be ignored.

Usage:
```python
from datetime import timedelta

from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext

# Instantiate and configure the HttpCrawler
crawler = HttpCrawler(
    additional_http_error_status_codes=[500, 502],
    ignore_http_error_status_codes=[404],
    max_request_retries=3,
    request_handler_timeout=timedelta(seconds=30),
)


# Register a handler for each successfully fetched page
@crawler.router.default_handler
async def handler(context: HttpCrawlingContext) -> None:
    context.log.info(f'Fetched {context.request.url}')


# Run the crawler to start fetching URLs
await crawler.run(['https://crawlee.dev'])
```
"""

def __init__(
self,
@@ -51,6 +73,15 @@ def __init__(
super().__init__(**kwargs)

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Executes an HTTP request using the `httpx` client with the provided context parameters.

Args:
context: The crawling context containing request, session,
and other relevant parameters for the HTTP request.

Yields:
The context object, updated with the HTTP response details.
"""
result = await self._http_client.crawl(
request=context.request,
session=context.session,
@@ -70,6 +101,17 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
)

async def _handle_blocked_request(self, context: HttpCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
"""Handles blocked requests by checking the HTTP status code and managing session behavior.

If a blocked status code is detected and the retry option is enabled,
the session is flagged as blocked to trigger a retry mechanism.

Args:
context: The crawling context containing the HTTP response and session information.

Yields:
The same context if no signs of blocking are detected.

Raises:
SessionError: If the HTTP status code indicates that the request was blocked.
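
A sketch of the same idea from user code, raising `SessionError` (import path assumed) to mark the current session as blocked and force a rotation:
```python
from crawlee.errors import SessionError

@crawler.router.default_handler
async def handler(context: HttpCrawlingContext) -> None:
    if context.http_response.status_code == 403:
        # Marks the session as blocked so the crawler retires it and retries the
        # request with a fresh session (up to max_session_rotations attempts).
        raise SessionError('Blocked by the target site (HTTP 403)')
```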
"""
if self._retry_on_blocked:
status_code = context.http_response.status_code
