diff --git a/docs/guides/code/storages/rq_with_crawler_example.py b/docs/guides/code/storages/rq_with_crawler_example.py index 0f39cb5a7f..06895d89ce 100644 --- a/docs/guides/code/storages/rq_with_crawler_example.py +++ b/docs/guides/code/storages/rq_with_crawler_example.py @@ -5,7 +5,7 @@ async def main() -> None: # Create a new crawler (it can be any subclass of BasicCrawler). Request queue is a default - # request provider, it will be opened, and fully managed if not specified. + # request manager; it will be opened and fully managed if not specified. crawler = HttpCrawler() # Define the default request handler, which will be called for every request. diff --git a/docs/guides/code/storages/rq_with_crawler_explicit_example.py b/docs/guides/code/storages/rq_with_crawler_explicit_example.py index 349dfe5139..41e6e72c18 100644 --- a/docs/guides/code/storages/rq_with_crawler_explicit_example.py +++ b/docs/guides/code/storages/rq_with_crawler_explicit_example.py @@ -13,7 +13,7 @@ async def main() -> None: await request_queue.add_requests_batched(['https://apify.com/', 'https://crawlee.dev/']) # Create a new crawler (it can be any subclass of BasicCrawler) and pass the request - list as request provider to it. It will be managed by the crawler. + queue as request manager to it. It will be managed by the crawler. crawler = HttpCrawler(request_manager=request_queue) # Define the default request handler, which will be called for every request. diff --git a/src/crawlee/_autoscaling/snapshotter.py b/src/crawlee/_autoscaling/snapshotter.py index a62d5e4df3..26669096b6 100644 --- a/src/crawlee/_autoscaling/snapshotter.py +++ b/src/crawlee/_autoscaling/snapshotter.py @@ -102,10 +102,10 @@ def __init__( @classmethod def from_config(cls, config: Configuration | None = None) -> Snapshotter: - """Create a new instance based on the provided configuration. + """Create a new instance based on the provided `Configuration`. Args: - config: The configuration object. Uses the global (default) configuration if not provided. + config: The `Configuration` instance. Uses the global (default) one if not provided. """ config = service_locator.get_configuration() @@ -132,7 +132,7 @@ def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedList[T]: @property def active(self) -> bool: - """Indicates whether the context is active.""" + """Indicate whether the context is active.""" return self._active async def __aenter__(self) -> Snapshotter: @@ -178,7 +178,7 @@ async def __aexit__( @ensure_context def get_memory_sample(self, duration: timedelta | None = None) -> list[Snapshot]: - """Returns a sample of the latest memory snapshots. + """Return a sample of the latest memory snapshots. Args: duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history. @@ -191,7 +191,7 @@ def get_memory_sample(self, duration: timedelta | None = None) -> list[Snapshot] @ensure_context def get_event_loop_sample(self, duration: timedelta | None = None) -> list[Snapshot]: - """Returns a sample of the latest event loop snapshots. + """Return a sample of the latest event loop snapshots. Args: duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history. @@ -204,7 +204,7 @@ def get_event_loop_sample(self, duration: timedelta | None = None) -> list[Snaps @ensure_context def get_cpu_sample(self, duration: timedelta | None = None) -> list[Snapshot]: - """Returns a sample of the latest CPU snapshots. + """Return a sample of the latest CPU snapshots.
Args: duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history. @@ -217,7 +217,7 @@ def get_cpu_sample(self, duration: timedelta | None = None) -> list[Snapshot]: @ensure_context def get_client_sample(self, duration: timedelta | None = None) -> list[Snapshot]: - """Returns a sample of the latest client snapshots. + """Return a sample of the latest client snapshots. Args: duration: The duration of the sample from the latest snapshot. If omitted, it returns a full history. @@ -230,7 +230,7 @@ def get_client_sample(self, duration: timedelta | None = None) -> list[Snapshot] @staticmethod def _get_sample(snapshots: list[Snapshot], duration: timedelta | None = None) -> list[Snapshot]: - """Returns a time-limited sample from snapshots or full history if duration is None.""" + """Return a time-limited sample from snapshots or full history if duration is None.""" if not duration: return snapshots diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 6d277e1690..1aa8609d77 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -38,6 +38,7 @@ else: from pydantic import JsonValue as JsonSerializable +T = TypeVar('T') HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH'] @@ -51,6 +52,7 @@ def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]: return dict(sorted_headers) +@docs_group('Data structures') class HttpHeaders(RootModel, Mapping[str, str]): """A dictionary-like object representing HTTP headers.""" @@ -149,6 +151,7 @@ def __init__( self.max_tasks_per_minute = max_tasks_per_minute +@docs_group('Data structures') class StorageTypes(str, Enum): """Possible Crawlee storage types.""" @@ -180,134 +183,19 @@ class AddRequestsKwargs(EnqueueLinksKwargs): """Keyword arguments for the `add_requests` methods.""" requests: Sequence[str | BaseRequestData | Request] - """Requests to be added to the request provider.""" - - -class AddRequestsFunction(Protocol): - """Type of a function for adding URLs to the request queue with optional filtering. - - This helper method simplifies the process of adding requests to the request provider. - It opens the specified request provider and adds the requests to it. - """ - - def __call__( - self, - requests: Sequence[str | BaseRequestData | Request], - **kwargs: Unpack[EnqueueLinksKwargs], - ) -> Coroutine[None, None, None]: ... - - -class GetDataFunction(Protocol): - """Type of a function for getting data from the dataset. - - This helper method simplifies the process of retrieving data from a dataset. It opens the specified - dataset and then retrieves the data based on the provided parameters. - """ - - def __call__( - self, - dataset_id: str | None = None, - dataset_name: str | None = None, - **kwargs: Unpack[GetDataKwargs], - ) -> Coroutine[None, None, DatasetItemsListPage]: ... + """Requests to be added to the `RequestManager`.""" class PushDataKwargs(TypedDict): """Keyword arguments for dataset's `push_data` method.""" -class PushDataFunction(Protocol): - """Type of a function for pushing data to the dataset. - - This helper method simplifies the process of pushing data to a dataset. It opens the specified - dataset and then pushes the provided data to it. - """ - - def __call__( - self, - data: JsonSerializable, - dataset_id: str | None = None, - dataset_name: str | None = None, - **kwargs: Unpack[PushDataKwargs], - ) -> Coroutine[None, None, None]: ... 
- - class PushDataFunctionCall(PushDataKwargs): data: JsonSerializable dataset_id: str | None dataset_name: str | None -class ExportToFunction(Protocol): - """Type of a function for exporting data from a dataset. - - This helper method simplifies the process of exporting data from a dataset. It opens the specified - dataset and then exports its content to the key-value store. - """ - - def __call__( - self, - dataset_id: str | None = None, - dataset_name: str | None = None, - **kwargs: Unpack[ExportToKwargs], - ) -> Coroutine[None, None, None]: ... - - -class EnqueueLinksFunction(Protocol): - """A function type for enqueueing new URLs to crawl, based on elements selected by a selector. - - This function is used to extract and enqueue new URLs from the current page for further crawling. - """ - - def __call__( - self, - *, - selector: str = 'a', - label: str | None = None, - user_data: dict[str, Any] | None = None, - **kwargs: Unpack[EnqueueLinksKwargs], - ) -> Coroutine[None, None, None]: - """A call dunder method. - - Args: - selector: selector used to find the elements containing the links. - - PlaywrightCrawler: supports CSS and XPath selectors - - ParselCrawler: supports CSS selectors - - BeautifulSoupCrawler: supports CSS selectors - label: Label for the newly created `Request` objects, used for request routing. - user_data: User data to be provided to the newly created `Request` objects. - **kwargs: Additional arguments for the `add_requests` method. - """ - - -class SendRequestFunction(Protocol): - """Type of a function for performing an HTTP request.""" - - def __call__( - self, - url: str, - *, - method: HttpMethod = 'GET', - headers: HttpHeaders | dict[str, str] | None = None, - ) -> Coroutine[None, None, HttpResponse]: ... - - -class UseStateFunction(Protocol): - """Type of a function for performing use state. - - Warning: - This is an experimental feature. The behavior and interface may change in future versions. - """ - - def __call__( - self, - default_value: dict[str, JsonSerializable] | None = None, - ) -> Coroutine[None, None, dict[str, JsonSerializable]]: ... - - -T = TypeVar('T') - - class KeyValueStoreInterface(Protocol): """The (limited) part of the `KeyValueStore` interface that should be accessible from a request handler.""" @@ -330,33 +218,6 @@ async def set_value( ) -> None: ... -class GetKeyValueStoreFromRequestHandlerFunction(Protocol): - """Type of a function for accessing a key-value store from within a request handler.""" - - def __call__( - self, - *, - id: str | None = None, - name: str | None = None, - ) -> Coroutine[None, None, KeyValueStoreInterface]: ... 
- - -@dataclass(frozen=True) -@docs_group('Data structures') -class BasicCrawlingContext: - """Basic crawling context intended to be extended by crawlers.""" - - request: Request - session: Session | None - proxy_info: ProxyInfo | None - send_request: SendRequestFunction - add_requests: AddRequestsFunction - push_data: PushDataFunction - use_state: UseStateFunction - get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction - log: logging.Logger - - @dataclass() class KeyValueStoreValue: content: Any @@ -392,17 +253,6 @@ async def get_value(self, key: str, default_value: T | None = None) -> T | None: return await self._actual_key_value_store.get_value(key, default_value) -class GetKeyValueStoreFunction(Protocol): - """Type of a function for accessing the live implementation of a key-value store.""" - - def __call__( - self, - *, - id: str | None = None, - name: str | None = None, - ) -> Coroutine[None, None, KeyValueStore]: ... - - class RequestHandlerRunResult: """Record of calls to storage-related context helpers.""" @@ -453,3 +303,258 @@ async def get_key_value_store( ) return self.key_value_store_changes[id, name] + + +@docs_group('Functions') +class AddRequestsFunction(Protocol): + """Function for adding requests to the `RequestManager`, with optional filtering. + + It simplifies the process of adding requests to the `RequestManager`. It automatically opens + the specified one and adds the provided requests. + """ + + def __call__( + self, + requests: Sequence[str | BaseRequestData | Request], + **kwargs: Unpack[EnqueueLinksKwargs], + ) -> Coroutine[None, None, None]: + """Call dunder method. + + Args: + requests: Requests to be added to the `RequestManager`. + **kwargs: Additional keyword arguments. + """ + + +@docs_group('Functions') +class EnqueueLinksFunction(Protocol): + """A function for enqueueing new URLs to crawl based on elements selected by a given selector. + + It extracts URLs from the current page and enqueues them for further crawling. It allows filtering through + selectors and other options. You can also specify labels and user data to be associated with the newly + created `Request` objects. + """ + + def __call__( + self, + *, + selector: str = 'a', + label: str | None = None, + user_data: dict[str, Any] | None = None, + **kwargs: Unpack[EnqueueLinksKwargs], + ) -> Coroutine[None, None, None]: + """A call dunder method. + + Args: + selector: A selector used to find the elements containing the links. The behaviour differs based + on the crawler used: + - `PlaywrightCrawler` supports CSS and XPath selectors. + - `ParselCrawler` supports CSS selectors. + - `BeautifulSoupCrawler` supports CSS selectors. + label: Label for the newly created `Request` objects, used for request routing. + user_data: User data to be provided to the newly created `Request` objects. + **kwargs: Additional keyword arguments. + """ + + +@docs_group('Functions') +class ExportToFunction(Protocol): + """A function for exporting data from a `Dataset`. + + It simplifies the process of exporting data from a `Dataset`. It opens the specified one and exports + its content to a `KeyValueStore`. + """ + + def __call__( + self, + dataset_id: str | None = None, + dataset_name: str | None = None, + **kwargs: Unpack[ExportToKwargs], + ) -> Coroutine[None, None, None]: + """Call dunder method. + + Args: + dataset_id: The ID of the `Dataset` to export data from. + dataset_name: The name of the `Dataset` to export data from. + **kwargs: Additional keyword arguments. 
+ """ + + +@docs_group('Functions') +class GetDataFunction(Protocol): + """A function for retrieving data from a `Dataset`. + + It simplifies the process of accessing data from a `Dataset`. It opens the specified one and retrieves + data based on the provided parameters. It allows filtering and pagination. + """ + + def __call__( + self, + dataset_id: str | None = None, + dataset_name: str | None = None, + **kwargs: Unpack[GetDataKwargs], + ) -> Coroutine[None, None, DatasetItemsListPage]: + """Call dunder method. + + Args: + dataset_id: ID of the `Dataset` to get data from. + dataset_name: Name of the `Dataset` to get data from. + **kwargs: Additional keyword arguments. + + Returns: + A page of retrieved items. + """ + + +@docs_group('Functions') +class GetKeyValueStoreFunction(Protocol): + """A function for accessing a `KeyValueStore`. + + It retrieves an instance of a `KeyValueStore` based on its ID or name. + """ + + def __call__( + self, + *, + id: str | None = None, + name: str | None = None, + ) -> Coroutine[None, None, KeyValueStore]: + """Call dunder method. + + Args: + id: The ID of the `KeyValueStore` to get. + name: The name of the `KeyValueStore` to get. + """ + + +class GetKeyValueStoreFromRequestHandlerFunction(Protocol): + """A function for accessing a `KeyValueStore`. + + It retrieves an instance of a `KeyValueStore` based on its ID or name. + """ + + def __call__( + self, + *, + id: str | None = None, + name: str | None = None, + ) -> Coroutine[None, None, KeyValueStoreInterface]: + """Call dunder method. + + Args: + id: The ID of the `KeyValueStore` to get. + name: The name of the `KeyValueStore` to get. + """ + + +@docs_group('Functions') +class PushDataFunction(Protocol): + """A function for pushing data to a `Dataset`. + + It simplifies the process of adding data to a `Dataset`. It opens the specified one and pushes + the provided data to it. + """ + + def __call__( + self, + data: JsonSerializable, + dataset_id: str | None = None, + dataset_name: str | None = None, + **kwargs: Unpack[PushDataKwargs], + ) -> Coroutine[None, None, None]: + """Call dunder method. + + Args: + data: The data to push to the `Dataset`. + dataset_id: The ID of the `Dataset` to push the data to. + dataset_name: The name of the `Dataset` to push the data to. + **kwargs: Additional keyword arguments. + """ + + +@docs_group('Functions') +class SendRequestFunction(Protocol): + """A function for sending HTTP requests. + + It simplifies the process of sending HTTP requests. It is implemented by the crawling context and is used + within request handlers to send additional HTTP requests to target URLs. + """ + + def __call__( + self, + url: str, + *, + method: HttpMethod = 'GET', + headers: HttpHeaders | dict[str, str] | None = None, + ) -> Coroutine[None, None, HttpResponse]: + """A call dunder method. + + Args: + url: The URL to send the request to. + method: The HTTP method to use. + headers: The headers to include in the request. + + Returns: + The HTTP response received from the server. + """ + + +@docs_group('Functions') +class UseStateFunction(Protocol): + """A function for managing state within the crawling context. + + It allows the use of persistent state across multiple crawls. + + Warning: + This is an experimental feature. The behavior and interface may change in future versions. + """ + + def __call__( + self, + default_value: dict[str, JsonSerializable] | None = None, + ) -> Coroutine[None, None, dict[str, JsonSerializable]]: + """Call dunder method. 
+ + Args: + default_value: The default value to initialize the state if it is not already set. + + Returns: + The current state. + """ + + +@dataclass(frozen=True) +@docs_group('Data structures') +class BasicCrawlingContext: + """Basic crawling context. + + It represents the fundamental crawling context used by the `BasicCrawler`. It is extended by more + specific crawlers to provide additional functionality. + """ + + request: Request + """Request object for the current page being processed.""" + + session: Session | None + """Session object for the current page being processed.""" + + proxy_info: ProxyInfo | None + """Proxy information for the current page being processed.""" + + send_request: SendRequestFunction + """Send request crawling context helper function.""" + + add_requests: AddRequestsFunction + """Add requests crawling context helper function.""" + + push_data: PushDataFunction + """Push data crawling context helper function.""" + + use_state: UseStateFunction + """Use state crawling context helper function.""" + + get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction + """Get key-value store crawling context helper function.""" + + log: logging.Logger + """Logger instance.""" diff --git a/src/crawlee/_utils/http.py b/src/crawlee/_utils/http.py index 9754d95275..2624383abf 100644 --- a/src/crawlee/_utils/http.py +++ b/src/crawlee/_utils/http.py @@ -2,10 +2,10 @@ def is_status_code_client_error(value: int) -> bool: - """Returns `True` for 4xx status codes, `False` otherwise.""" + """Return `True` for 4xx status codes, `False` otherwise.""" return 400 <= value <= 499 # noqa: PLR2004 def is_status_code_server_error(value: int) -> bool: - """Returns `True` for 5xx status codes, `False` otherwise.""" + """Return `True` for 5xx status codes, `False` otherwise.""" return value >= 500 # noqa: PLR2004 diff --git a/src/crawlee/browsers/_base_browser_controller.py b/src/crawlee/browsers/_base_browser_controller.py index f20b418ba8..2290ec70ba 100644 --- a/src/crawlee/browsers/_base_browser_controller.py +++ b/src/crawlee/browsers/_base_browser_controller.py @@ -29,7 +29,7 @@ def pages(self) -> list[Page]: @property @abstractmethod def pages_count(self) -> int: - """Returns the number of currently open pages.""" + """Return the number of currently open pages.""" @property @abstractmethod diff --git a/src/crawlee/browsers/_base_browser_plugin.py b/src/crawlee/browsers/_base_browser_plugin.py index 9f08310081..5636993d45 100644 --- a/src/crawlee/browsers/_base_browser_plugin.py +++ b/src/crawlee/browsers/_base_browser_plugin.py @@ -26,7 +26,7 @@ class BaseBrowserPlugin(ABC): @property @abstractmethod def active(self) -> bool: - """Indicates whether the context is active.""" + """Indicate whether the context is active.""" @property @abstractmethod diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py index 4a99aa6629..f5c9d2da2f 100644 --- a/src/crawlee/browsers/_browser_pool.py +++ b/src/crawlee/browsers/_browser_pool.py @@ -30,9 +30,9 @@ @docs_group('Classes') class BrowserPool: - """Manages a pool of browsers and their pages, handling lifecycle events and resource allocation. + """Manage a pool of browsers and pages, handling their lifecycle and resource allocation. - This class is responsible for opening and closing browsers, managing pages within those browsers, + The `BrowserPool` is responsible for opening and closing browsers, managing pages within those browsers, and handling the overall lifecycle of these resources. 
It provides flexible configuration via constructor options, which include various hooks that allow for the insertion of custom behavior at different stages of the browser and page lifecycles. @@ -153,12 +153,12 @@ def pages(self) -> Mapping[str, CrawleePage]: @property def total_pages_count(self) -> int: - """Returns the total number of pages opened since the browser pool was launched.""" + """Return the total number of pages opened since the browser pool was launched.""" return self._total_pages_count @property def active(self) -> bool: - """Indicates whether the context is active.""" + """Indicate whether the context is active.""" return self._active async def __aenter__(self) -> BrowserPool: @@ -218,7 +218,7 @@ async def new_page( browser_plugin: BaseBrowserPlugin | None = None, proxy_info: ProxyInfo | None = None, ) -> CrawleePage: - """Opens a new page in a browser using the specified or a random browser plugin. + """Open a new page in a browser using the specified or a random browser plugin. Args: page_id: The ID to assign to the new page. If not provided, a random ID is generated. diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index 2c7417cf79..639f00f86e 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -29,8 +29,8 @@ class PlaywrightBrowserController(BaseBrowserController): """Controller for managing Playwright browser instances and their pages. - This class provides methods to manage pages within a browser instance, ensuring that the number - of open pages does not exceed the specified limit and tracking the state of the pages. + It provides methods to control browser instances, manage their pages, and handle context-specific + configurations. It enforces limits on the number of open pages and tracks their state. """ AUTOMATION_LIBRARY = 'playwright' diff --git a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index a8a4867c6f..770e865e18 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -27,7 +27,11 @@ class PlaywrightBrowserPlugin(BaseBrowserPlugin): """A plugin for managing Playwright automation library. - It should work as a factory for creating new browser instances. + It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory + for creating new browser instances and provides a unified interface for interacting with different browser types + (chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode, + executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each + browser instance, ensuring that resource limits are respected. """ AUTOMATION_LIBRARY = 'playwright' diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index 5b6921eda4..ae29b27e8f 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -86,7 +86,7 @@ class Configuration(BaseSettings): ) ), ] = 'default' - """The default dataset ID. This option is utilized by the storage client.""" + """The default `Dataset` ID. This option is utilized by the storage client.""" default_key_value_store_id: Annotated[ str, @@ -98,7 +98,7 @@ class Configuration(BaseSettings): ) ), ] = 'default' - """The default key-value store ID. 
This option is utilized by the storage client.""" + """The default `KeyValueStore` ID. This option is utilized by the storage client.""" default_request_queue_id: Annotated[ str, @@ -110,7 +110,7 @@ class Configuration(BaseSettings): ) ), ] = 'default' - """The default request queue ID. This option is utilized by the storage client.""" + """The default `RequestQueue` ID. This option is utilized by the storage client.""" purge_on_start: Annotated[ bool, diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 35400fbe43..b1c6cd5919 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -71,7 +71,7 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]): """ configuration: NotRequired[Configuration] - """The configuration object. Some of its properties are used as defaults for the crawler.""" + """The `Configuration` instance. Some of its properties are used as defaults for the crawler.""" event_manager: NotRequired[EventManager] """The event manager for managing events for the crawler and all its components.""" @@ -203,7 +203,7 @@ def __init__( """A default constructor. Args: - configuration: The configuration object. Some of its properties are used as defaults for the crawler. + configuration: The `Configuration` instance. Some of its properties are used as defaults for the crawler. event_manager: The event manager for managing events for the crawler and all its components. storage_client: The storage client for managing storages for the crawler and all its components. request_manager: Manager of requests that should be processed by the crawler. @@ -328,7 +328,7 @@ def log(self) -> logging.Logger: @property def router(self) -> Router[TCrawlingContext]: - """The router used to handle each individual crawling request.""" + """The `Router` used to handle each individual crawling request.""" if self._router is None: self._router = Router[TCrawlingContext]() @@ -393,7 +393,7 @@ async def _get_proxy_info(self, request: Request, session: Session | None) -> Pr ) async def get_request_manager(self) -> RequestManager: - """Return the configured request provider. If none is configured, open and return the default request queue.""" + """Return the configured request manager. If none is configured, open and return the default request queue.""" if not self._request_manager: self._request_manager = await RequestQueue.open() @@ -405,7 +405,7 @@ async def get_dataset( id: str | None = None, name: str | None = None, ) -> Dataset: - """Return the dataset with the given ID or name. If none is provided, return the default dataset.""" + """Return the `Dataset` with the given ID or name. If none is provided, return the default one.""" return await Dataset.open(id=id, name=name) async def get_key_value_store( @@ -414,7 +414,7 @@ async def get_key_value_store( id: str | None = None, name: str | None = None, ) -> KeyValueStore: - """Return the key-value store with the given ID or name. If none is provided, return the default KVS.""" + """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS.""" return await KeyValueStore.open(id=id, name=name) def error_handler( @@ -543,7 +543,7 @@ async def add_requests( wait_for_all_requests_to_be_added: bool = False, wait_for_all_requests_to_be_added_timeout: timedelta | None = None, ) -> None: - """Add requests to the underlying request provider in batches. 
+ """Add requests to the underlying request manager in batches. Args: requests: A list of requests to add to the queue. @@ -576,15 +576,15 @@ async def get_data( dataset_name: str | None = None, **kwargs: Unpack[GetDataKwargs], ) -> DatasetItemsListPage: - """Retrieve data from a dataset. + """Retrieve data from a `Dataset`. - This helper method simplifies the process of retrieving data from a dataset. It opens the specified - dataset and then retrieves the data based on the provided parameters. + This helper method simplifies the process of retrieving data from a `Dataset`. It opens the specified + one and then retrieves the data based on the provided parameters. Args: - dataset_id: The ID of the dataset. - dataset_name: The name of the dataset. - kwargs: Keyword arguments to be passed to the dataset's `get_data` method. + dataset_id: The ID of the `Dataset`. + dataset_name: The name of the `Dataset`. + kwargs: Keyword arguments to be passed to the `Dataset.get_data()` method. Returns: The retrieved data. @@ -598,16 +598,16 @@ async def export_data( dataset_id: str | None = None, dataset_name: str | None = None, ) -> None: - """Export data from a dataset. + """Export data from a `Dataset`. - This helper method simplifies the process of exporting data from a dataset. It opens the specified - dataset and then exports the data based on the provided parameters. If you need to pass options + This helper method simplifies the process of exporting data from a `Dataset`. It opens the specified + one and then exports the data based on the provided parameters. If you need to pass options specific to the output format, use the `export_data_csv` or `export_data_json` method instead. Args: path: The destination path. - dataset_id: The ID of the dataset. - dataset_name: The name of the dataset. + dataset_id: The ID of the `Dataset`. + dataset_name: The name of the `Dataset`. """ dataset = await self.get_dataset(id=dataset_id, name=dataset_name) @@ -629,16 +629,16 @@ async def export_data_csv( dataset_name: str | None = None, **kwargs: Unpack[ExportDataCsvKwargs], ) -> None: - """Export data from a dataset to a CSV file. + """Export data from a `Dataset` to a CSV file. - This helper method simplifies the process of exporting data from a dataset in csv format. It opens the specified - dataset and then exports the data based on the provided parameters. + This helper method simplifies the process of exporting data from a `Dataset` in csv format. It opens + the specified one and then exports the data based on the provided parameters. Args: path: The destination path. content_type: The output format. - dataset_id: The ID of the dataset. - dataset_name: The name of the dataset. + dataset_id: The ID of the `Dataset`. + dataset_name: The name of the `Dataset`. kwargs: Extra configurations for dumping/writing in csv format. """ dataset = await self.get_dataset(id=dataset_id, name=dataset_name) @@ -654,15 +654,15 @@ async def export_data_json( dataset_name: str | None = None, **kwargs: Unpack[ExportDataJsonKwargs], ) -> None: - """Export data from a dataset to a JSON file. + """Export data from a `Dataset` to a JSON file. - This helper method simplifies the process of exporting data from a dataset in json format. It opens the - specified dataset and then exports the data based on the provided parameters. + This helper method simplifies the process of exporting data from a `Dataset` in json format. It opens the + specified one and then exports the data based on the provided parameters. 
Args: path: The destination path - dataset_id: The ID of the dataset. - dataset_name: The name of the dataset. + dataset_id: The ID of the `Dataset`. + dataset_name: The name of the `Dataset`. kwargs: Extra configurations for dumping/writing in json format. """ dataset = await self.get_dataset(id=dataset_id, name=dataset_name) @@ -677,16 +677,16 @@ async def _push_data( dataset_name: str | None = None, **kwargs: Unpack[PushDataKwargs], ) -> None: - """Push data to a dataset. + """Push data to a `Dataset`. - This helper method simplifies the process of pushing data to a dataset. It opens the specified - dataset and then pushes the provided data to it. + This helper method simplifies the process of pushing data to a `Dataset`. It opens the specified + one and then pushes the provided data to it. Args: - data: The data to push to the dataset. - dataset_id: The ID of the dataset. - dataset_name: The name of the dataset. - kwargs: Keyword arguments to be passed to the dataset's `push_data` method. + data: The data to push to the `Dataset`. + dataset_id: The ID of the `Dataset`. + dataset_name: The name of the `Dataset`. + kwargs: Keyword arguments to be passed to the `Dataset.push_data()` method. """ dataset = await self.get_dataset(id=dataset_id, name=dataset_name) await dataset.push_data(data, **kwargs) diff --git a/src/crawlee/events/_event_manager.py b/src/crawlee/events/_event_manager.py index 07a3abe9b7..20ff924584 100644 --- a/src/crawlee/events/_event_manager.py +++ b/src/crawlee/events/_event_manager.py @@ -52,10 +52,11 @@ class EventManagerOptions(TypedDict): @docs_group('Classes') class EventManager: - """Event manager for registering, emitting, and managing event listeners. + """Manage events and their listeners, enabling registration, emission, and execution control. - Event manager allows you to register event listeners, emit events, and wait for event listeners to complete - their execution. It is built on top of the `pyee.asyncio.AsyncIOEventEmitter` class. + It allows for registering event listeners, emitting events, and ensuring all listeners complete their execution. + Built on top of `pyee.asyncio.AsyncIOEventEmitter`. It implements additional features such as waiting for all + listeners to complete and emitting `PersistState` events at regular intervals. """ def __init__( @@ -96,7 +97,7 @@ def __init__( @property def active(self) -> bool: - """Indicates whether the context is active.""" + """Indicate whether the context is active.""" return self._active async def __aenter__(self) -> EventManager: @@ -149,7 +150,7 @@ def on(self, *, event: Literal[Event.EXIT], listener: EventListener[EventExitDat def on(self, *, event: Event, listener: EventListener[None]) -> None: ... def on(self, *, event: Event, listener: EventListener[Any]) -> None: - """Add an event listener to the event manager. + """Register an event listener for a specific event. Args: event: The event for which to listen to. @@ -195,7 +196,7 @@ async def listener_wrapper(event_data: EventData) -> None: self._event_emitter.add_listener(event.value, listener_wrapper) def off(self, *, event: Event, listener: EventListener[Any] | None = None) -> None: - """Remove a listener, or all listeners, from an Actor event. + """Remove a specific listener or all listeners for an event. Args: event: The Actor event for which to remove listeners. @@ -225,7 +226,7 @@ def emit(self, *, event: Event, event_data: Any) -> None: ... @ensure_context def emit(self, *, event: Event, event_data: EventData) -> None: - """Emit an event. 
+ """Emit an event with the associated data to all registered listeners. Args: event: The event which will be emitted. diff --git a/src/crawlee/events/_local_event_manager.py b/src/crawlee/events/_local_event_manager.py index b5d05ddc97..83dc5f8c8e 100644 --- a/src/crawlee/events/_local_event_manager.py +++ b/src/crawlee/events/_local_event_manager.py @@ -24,7 +24,12 @@ @docs_group('Classes') class LocalEventManager(EventManager): - """Local event manager for emitting system info events.""" + """Event manager for local environments. + + It extends the `EventManager` to emit `SystemInfo` events at regular intervals. The `LocalEventManager` + is intended to be used in local environments, where the system metrics are required for managing the `Snapshotter` + and `AutoscaledPool`. + """ def __init__( self, @@ -52,10 +57,10 @@ def __init__( @classmethod def from_config(cls, config: Configuration | None = None) -> LocalEventManager: - """Create a new instance based on the provided configuration. + """Create a new instance based on the provided `Configuration`. Args: - config: The configuration object. Uses the global (default) configuration if not provided. + config: The `Configuration` instance. Uses the global (default) one if not provided. """ config = config or Configuration.get_global_configuration() diff --git a/src/crawlee/request_loaders/_request_loader.py b/src/crawlee/request_loaders/_request_loader.py index 3efb58de72..5d9d6df24d 100644 --- a/src/crawlee/request_loaders/_request_loader.py +++ b/src/crawlee/request_loaders/_request_loader.py @@ -27,19 +27,19 @@ class RequestLoader(ABC): @abstractmethod async def get_total_count(self) -> int: - """Returns an offline approximation of the total number of requests in the source (i.e. pending + handled).""" + """Return an offline approximation of the total number of requests in the source (i.e. pending + handled).""" @abstractmethod async def is_empty(self) -> bool: - """Returns True if there are no more requests in the source (there might still be unfinished requests).""" + """Return True if there are no more requests in the source (there might still be unfinished requests).""" @abstractmethod async def is_finished(self) -> bool: - """Returns True if all requests have been handled.""" + """Return True if all requests have been handled.""" @abstractmethod async def fetch_next_request(self) -> Request | None: - """Returns the next request to be processed, or `null` if there are no more pending requests.""" + """Return the next request to be processed, or `None` if there are no more pending requests.""" @abstractmethod async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None: @@ -47,7 +47,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | @abstractmethod async def get_handled_count(self) -> int: - """Returns the number of handled requests.""" + """Return the number of handled requests.""" async def to_tandem(self, request_manager: RequestManager | None = None) -> RequestManagerTandem: """Combine the loader with a request manager to support adding and reclaiming requests. diff --git a/src/crawlee/sessions/_session.py b/src/crawlee/sessions/_session.py index 963eb3a05a..ff5fba8f7e 100644 --- a/src/crawlee/sessions/_session.py +++ b/src/crawlee/sessions/_session.py @@ -15,15 +15,16 @@ @docs_group('Data structures') class Session: - """Session object represents a single user session. + """Represent a single user session, managing cookies, error states, and usage limits.
- Sessions are used to store information such as cookies and can be used for generating fingerprints and proxy - sessions. You can imagine each session as a specific user, with its own cookies, IP (via proxy) and potentially - a unique browser fingerprint. Session internal state can be enriched with custom user data for example some - authorization tokens and specific headers in general. + A `Session` simulates a specific user with attributes like cookies, IP (via proxy), and potentially + a unique browser fingerprint. It maintains its internal state, which can include custom user data + (e.g., authorization tokens or headers) and tracks its usability through metrics such as error score, + usage count, and expiration. """ _DEFAULT_BLOCKED_STATUS_CODES: ClassVar = [401, 403, 429] + """Default status codes that indicate a session is blocked.""" def __init__( self, @@ -69,7 +70,7 @@ def __init__( @classmethod def from_model(cls, model: SessionModel) -> Session: - """Create a new instance from a SessionModel.""" + """Create a new instance from a `SessionModel`.""" return cls(**model.model_dump()) def __repr__(self) -> str: diff --git a/src/crawlee/sessions/_session_pool.py b/src/crawlee/sessions/_session_pool.py index c5dd99c066..e37eed02ca 100644 --- a/src/crawlee/sessions/_session_pool.py +++ b/src/crawlee/sessions/_session_pool.py @@ -26,7 +26,12 @@ @docs_group('Classes') class SessionPool: - """Session pool is a pool of sessions that are rotated based on the usage count or age.""" + """A pool of sessions that are managed, rotated, and persisted based on usage and age. + + It ensures effective session management by maintaining a pool of sessions and rotating them based on + usage count, expiration time, or custom rules. It provides methods to retrieve sessions, manage their + lifecycle, and optionally persist the state to enable recovery. + """ def __init__( self, @@ -95,7 +100,7 @@ def retired_session_count(self) -> int: @property def active(self) -> bool: - """Indicates whether the context is active.""" + """Indicate whether the context is active.""" return self._active async def __aenter__(self) -> SessionPool: @@ -177,10 +182,13 @@ def get_state(self, *, as_dict: bool = False) -> SessionPoolModel | dict: @ensure_context def add_session(self, session: Session) -> None: - """Add a specific session to the pool. + """Add an externally created session to the pool. This is intened only for the cases when you want to add a session that was created outside of the pool. Otherwise, the pool will create new sessions automatically. + + Args: + session: The session to add to the pool. """ if session.id in self._sessions: logger.warning(f'Session with ID {session.id} already exists in the pool.') diff --git a/src/crawlee/statistics/_statistics.py b/src/crawlee/statistics/_statistics.py index 00612f4753..6a669bb064 100644 --- a/src/crawlee/statistics/_statistics.py +++ b/src/crawlee/statistics/_statistics.py @@ -55,9 +55,12 @@ def retry_count(self) -> int: @docs_group('Classes') class Statistics(Generic[TStatisticsState]): - """An interface to collecting and logging runtime statistics for requests. + """A class for collecting, tracking, and logging runtime statistics for requests. - All information is saved to the key value store so that it persists between migrations, abortions and resurrections. + It is designed to record information such as request durations, retries, successes, and failures, enabling + analysis of crawler performance. 
The collected statistics are persisted to a `KeyValueStore`, ensuring they + remain available across crawler migrations, abortions, and restarts. This persistence allows for tracking + and evaluation of crawler behavior over its lifecycle. """ __next_id = 0 @@ -104,7 +107,7 @@ def __init__( @property def active(self) -> bool: - """Indicates whether the context is active.""" + """Indicate whether the context is active.""" return self._active async def __aenter__(self) -> Self: diff --git a/src/crawlee/storage_clients/_memory/_memory_storage_client.py b/src/crawlee/storage_clients/_memory/_memory_storage_client.py index 7dfde57376..38ad2918d8 100644 --- a/src/crawlee/storage_clients/_memory/_memory_storage_client.py +++ b/src/crawlee/storage_clients/_memory/_memory_storage_client.py @@ -96,10 +96,10 @@ def __init__( @classmethod def from_config(cls, config: Configuration | None = None) -> MemoryStorageClient: - """Create a new instance based on the provided configuration. + """Create a new instance based on the provided `Configuration`. Args: - config: The configuration object. Uses the global (default) configuration if not provided. + config: The `Configuration` instance. Uses the global (default) one if not provided. """ config = config or Configuration.get_global_configuration() diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index e45f95a93c..552f67b3e6 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -38,7 +38,7 @@ class GetDataKwargs(TypedDict): """The maximum number of items to retrieve. Unlimited if None.""" clean: NotRequired[bool] - """Returns only non-empty items and excludes hidden fields. Shortcut for skip_hidden and skip_empty.""" + """Return only non-empty items and exclude hidden fields. Shortcut for skip_hidden and skip_empty.""" desc: NotRequired[bool] """Set to True to sort results in descending order."""
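The renamed `request_manager` wiring in the documentation examples and the context helpers documented in `_types.py` (`add_requests`, `push_data`) fit together roughly as follows. This is a minimal sketch, assuming the `crawlee.crawlers` and `crawlee.storages` import paths and the `HttpCrawlingContext` handler type; the queue name and URLs are illustrative only.

import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.storages import RequestQueue


async def main() -> None:
    # Open (or create) a named request queue and seed it with start URLs.
    request_queue = await RequestQueue.open(name='my-queue')
    await request_queue.add_requests_batched(['https://apify.com/', 'https://crawlee.dev/'])

    # Pass the queue to the crawler as its request manager; the crawler manages its lifecycle.
    crawler = HttpCrawler(request_manager=request_queue)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Context helpers: enqueue a follow-up request and push one record to the default dataset.
        await context.add_requests(['https://crawlee.dev/python/docs/quick-start'])
        await context.push_data({'url': context.request.url, 'status': 'processed'})

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())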
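The `SessionPool` and `Session` docstrings above describe rotation by usage count, error score, and expiration, plus `add_session` for externally created sessions. A minimal sketch of that flow, assuming the `crawlee.sessions` exports and constructor parameters such as `max_pool_size` and `max_usage_count`:

import asyncio

from crawlee.sessions import Session, SessionPool


async def main() -> None:
    # The pool rotates sessions based on usage count, error score, and expiration.
    async with SessionPool(max_pool_size=20) as session_pool:
        # Retrieve a session managed by the pool.
        session = await session_pool.get_session()
        print(f'Using session {session.id}')

        # Add an externally created session, e.g. one with a custom usage limit.
        custom_session = Session(max_usage_count=10)
        session_pool.add_session(custom_session)


if __name__ == '__main__':
    asyncio.run(main())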
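Several components touched in this diff (`Snapshotter`, `LocalEventManager`, `MemoryStorageClient`) expose a `from_config` factory whose docstrings are reworded above to reference `Configuration`. A sketch of the intended call pattern, assuming the fields shown above (`purge_on_start`, `default_dataset_id`) can be passed as keyword overrides:

from crawlee.configuration import Configuration
from crawlee.events import LocalEventManager
from crawlee.storage_clients import MemoryStorageClient

# Build an explicit configuration, overriding only the fields of interest.
config = Configuration(purge_on_start=False, default_dataset_id='my-dataset')

# Components accept the instance directly...
event_manager = LocalEventManager.from_config(config)
storage_client = MemoryStorageClient.from_config(config)

# ...or fall back to the global (default) configuration when called without arguments.
default_storage_client = MemoryStorageClient.from_config()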