diff --git a/src/apify/memory_storage/memory_storage.py b/src/apify/memory_storage/memory_storage.py index 276e07fb..3a171820 100644 --- a/src/apify/memory_storage/memory_storage.py +++ b/src/apify/memory_storage/memory_storage.py @@ -47,7 +47,7 @@ def __init__( persist_storage (bool, optional): Whether to persist the data to the `local_data_directory` or just keep them in memory write_metadata (bool, optional): Whether to persist metadata of the storages as well """ - self._local_data_directory = local_data_directory + self._local_data_directory = local_data_directory # TODO: Make this work with `APIFY_LOCAL_STORAGE_DIR` self._datasets_directory = os.path.join(self._local_data_directory, 'datasets') self._key_value_stores_directory = os.path.join(self._local_data_directory, 'key_value_stores') self._request_queues_directory = os.path.join(self._local_data_directory, 'request_queues') diff --git a/src/apify/storages/dataset.py b/src/apify/storages/dataset.py index c123138a..78a8a605 100644 --- a/src/apify/storages/dataset.py +++ b/src/apify/storages/dataset.py @@ -16,14 +16,44 @@ class Dataset: - """TODO: docs.""" + """The `Dataset` class represents a store for structured data where each object stored has the same attributes. + + You can imagine it as a table, where each object is a row and its attributes are columns. + Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records. + Typically it is used to store crawling results. + + Do not instantiate this class directly, use the `Actor.open_dataset()` function instead. + + `Dataset` stores its data either on local disk or in the Apify cloud, + depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. + + If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in + the local directory in the following files: + ``` + {APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json + ``` + Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`, + unless you override it by setting the `APIFY_DEFAULT_DATASET_ID` environment variable. + Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the item in the dataset. + + If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the + [Apify Dataset](https://docs.apify.com/storage/dataset) cloud storage. + """ _id: str _name: Optional[str] _client: Union[DatasetClientAsync, DatasetClient] def __init__(self, id: str, name: Optional[str], client: Union[ApifyClientAsync, MemoryStorage]) -> None: - """TODO: docs (constructor should be "internal").""" + """Create a `Dataset` instance. + + Do not use the constructor directly, use the `Dataset.open` function instead. + + Args: + id (str): ID of the dataset. + name (str, optional): Name of the dataset. + client (ApifyClientAsync or MemoryStorage): The storage client which should be used. + """ self.get_data = _wrap_internal(self._get_data_internal, self.get_data) # type: ignore self.push_data = _wrap_internal(self._push_data_internal, self.push_data) # type: ignore self.export_to_json = _wrap_internal(self._export_to_json_internal, self.export_to_json) # type: ignore @@ -47,7 +77,16 @@ def _get_default_name(cls, config: Configuration) -> str: @classmethod async def push_data(cls, data: JSONSerializable) -> None: - """TODO: docs.""" + """Store an object or an array of objects to the dataset. 
+ + The size of the data is limited by the receiving API and therefore `push_data()` will only + allow objects whose JSON representation is smaller than 9MB. When an array is passed, + none of the included objects may be larger than 9MB, but the array itself may be of any size. + + Args: + data (JSONSerializable): dict or array of dicts containing data to be stored in the default dataset. + The JSON representation of each item must be smaller than 9MB. + """ dataset = await cls.open() return await dataset.push_data(data) @@ -89,7 +128,35 @@ async def get_data( flatten: Optional[List[str]] = None, view: Optional[str] = None, ) -> ListPage: - """TODO: docs.""" + """Get items from the dataset. + + Args: + offset (int, optional): Number of items that should be skipped at the start. The default value is 0. + limit (int, optional): Maximum number of items to return. By default there is no limit. + desc (bool, optional): By default, results are returned in the same order as they were stored. + To reverse the order, set this parameter to True. + clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character). + The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. + Note that since some objects might be skipped from the output, the result might contain fewer items than the limit value. + fields (list of str, optional): A list of fields which should be picked from the items, + only these fields will remain in the resulting record objects. + Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. + You can use this feature to effectively fix the output format. + omit (list of str, optional): A list of fields which should be omitted from the items. + unwind (str, optional): Name of a field which should be unwound. + If the field is an array then every element of the array will become a separate record and will be merged with the parent object. + If the unwound field is an object then it is merged with the parent object. + If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, + then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. + skip_empty (bool, optional): If True, then empty items are skipped from the output. + Note that if used, the results might contain fewer items than the limit value. + skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. + flatten (list of str, optional): A list of fields that should be flattened. + view (str, optional): Name of the dataset view to be used. + + Returns: + ListPage: A page of the list of dataset items according to the specified filters. + """ dataset = await cls.open() return await dataset.get_data( offset=offset, @@ -151,7 +218,14 @@ async def export_to( to_key_value_store: Optional[str] = None, content_type: Optional[str] = None, ) -> None: - """TODO: docs.""" + """Save the entirety of the dataset's contents into one file within a key-value store. + + Args: + key (str): The key to save the data under. + to_key_value_store (str, optional): The name of the key-value store in which the result will be saved. + Uses default key-value store if omitted. + content_type (str, optional): Either 'text/csv' or 'application/json'. Defaults to JSON.
+ """ key_value_store = await KeyValueStore.open(to_key_value_store) items: List[Dict] = [] limit = 1000 @@ -186,7 +260,14 @@ async def export_to_json( from_dataset: Optional[str] = None, to_key_value_store: Optional[str] = None, ) -> None: - """TODO: docs.""" + """Save the entirety of the dataset's contents into one JSON file within a key-value store. + + Args: + key (str): The key to save the data under. + from_dataset (str, optional): The source dataset in case of calling the class method. Uses default dataset if omitted. + to_key_value_store (str, optional): The name of the key-value store in which the result will be saved. + Uses default key-value store if omitted. + """ dataset = await cls.open(from_dataset) await dataset.export_to_json(key, to_key_value_store=to_key_value_store) @@ -207,7 +288,14 @@ async def export_to_csv( from_dataset: Optional[str] = None, to_key_value_store: Optional[str] = None, ) -> None: - """TODO: docs.""" + """Save the entirety of the dataset's contents into one CSV file within a key-value store. + + Args: + key (str): The key to save the data under. + from_dataset (str, optional): The source dataset in case of calling the class method. Uses default dataset if omitted. + to_key_value_store (str, optional): The name of the key-value store in which the result will be saved. + Uses default key-value store if omitted. + """ dataset = await cls.open(from_dataset) await dataset.export_to_csv(key, to_key_value_store=to_key_value_store) @@ -221,10 +309,14 @@ async def _export_to_csv_internal( await self.export_to(key, to_key_value_store=to_key_value_store, content_type='text/csv') async def get_info(self) -> Optional[Dict]: - """TODO: docs.""" + """Get an object containing general information about the dataset. + + Returns: + dict: Object returned by calling the GET dataset API endpoint. + """ return await self._client.get() - def iterate_items( # ~forEach in TS + def iterate_items( self, *, offset: int = 0, @@ -237,7 +329,33 @@ def iterate_items( # ~forEach in TS skip_empty: Optional[bool] = None, skip_hidden: Optional[bool] = None, ) -> AsyncIterator[Dict]: - """TODO: docs.""" + """Iterate over the items in the dataset. + + Args: + offset (int, optional): Number of items that should be skipped at the start. The default value is 0 + limit (int, optional): Maximum number of items to return. By default there is no limit. + desc (bool, optional): By default, results are returned in the same order as they were stored. + To reverse the order, set this parameter to True. + clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character). + The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters. + Note that since some objects might be skipped from the output, that the result might contain less items than the limit value. + fields (list of str, optional): A list of fields which should be picked from the items, + only these fields will remain in the resulting record objects. + Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter. + You can use this feature to effectively fix the output format. + omit (list of str, optional): A list of fields which should be omitted from the items. + unwind (str, optional): Name of a field which should be unwound. + If the field is an array then every element of the array will become a separate record and merged with parent object. 
+ If the unwound field is an object then it is merged with the parent object. + If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object, + then the item gets preserved as it is. Note that the unwound items ignore the desc parameter. + skip_empty (bool, optional): If True, then empty items are skipped from the output. + Note that if used, the results might contain fewer items than the limit value. + skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character. + + Yields: + dict: An item from the dataset. + """ return self._client.iterate_items( offset=offset, limit=limit, @@ -251,11 +369,25 @@ def iterate_items( # ~forEach in TS ) async def drop(self) -> None: - """TODO: docs.""" + """Remove the dataset either from the Apify cloud storage or from the local directory.""" await self._client.delete() await StorageManager.close_storage(self.__class__, self._id, self._name) @classmethod async def open(cls, dataset_id_or_name: Optional[str] = None, config: Optional[Configuration] = None) -> 'Dataset': - """TODO: docs.""" + """Open a dataset. + + Datasets are used to store structured data where each object stored has the same attributes, + such as online store products or real estate offers. + The actual data is stored either on the local filesystem or in the Apify cloud. + + Args: + dataset_id_or_name (str, optional): ID or name of the dataset to be opened. + If not provided, the method returns the default dataset associated with the actor run. + config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. + + Returns: + Dataset: An instance of the `Dataset` class for the given ID or name. + """ return await StorageManager.open_storage(cls, dataset_id_or_name, None, config)
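A short, illustrative sketch of how the `Dataset` API documented above fits together may be useful here. It is not part of the diff: the import path, the `ListPage.items`/`ListPage.total` attributes of the Apify client, and the `OUTPUT.json` key are assumptions made only for this example.

```python
from apify.storages import Dataset  # import path assumed

async def main() -> None:
    # Open the default dataset (pass a name or ID to open a specific one).
    dataset = await Dataset.open()

    # push_data() accepts a single dict or a list of dicts;
    # the JSON representation of each item must stay under 9MB.
    await dataset.push_data([{'url': 'https://example.com', 'title': 'Example'}])

    # get_data() returns one page of items together with paging metadata ...
    page = await dataset.get_data(limit=10, clean=True)
    print(page.items, page.total)

    # ... while iterate_items() walks through all the items one by one.
    async for item in dataset.iterate_items():
        print(item)

    # Export the whole dataset into the default key-value store under an assumed key.
    await dataset.export_to_json('OUTPUT.json')
```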
diff --git a/src/apify/storages/key_value_store.py b/src/apify/storages/key_value_store.py index 6f6f4c58..7780227d 100644 --- a/src/apify/storages/key_value_store.py +++ b/src/apify/storages/key_value_store.py @@ -1,4 +1,4 @@ -from typing import Any, AsyncIterator, Dict, Optional, Tuple, TypeVar, Union, overload +from typing import Any, AsyncIterator, NamedTuple, Optional, TypedDict, TypeVar, Union, overload from apify_client import ApifyClientAsync from apify_client.clients import KeyValueStoreClientAsync @@ -10,17 +10,56 @@ from .storage_manager import StorageManager T = TypeVar('T') +IterateKeysInfo = TypedDict('IterateKeysInfo', {'size': int}) +IterateKeysTuple = NamedTuple('IterateKeysTuple', [('key', str), ('info', IterateKeysInfo)]) class KeyValueStore: - """TODO: docs.""" + """The `KeyValueStore` class represents a key-value store. + + You can imagine it as a simple data storage that is used + for saving and reading data records or files. Each data record is + represented by a unique key and associated with a MIME content type. + + Do not instantiate this class directly, use the `Actor.open_key_value_store()` function instead. + + Each actor run is associated with a default key-value store, which is created exclusively + for the run. By convention, the actor input and output are stored into the + default key-value store under the `INPUT` and `OUTPUT` key, respectively. + Typically, input and output are JSON files, although they can be in any other format. + To access the default key-value store directly, you can use the + `KeyValueStore.get_value` and `KeyValueStore.set_value` convenience functions. + + `KeyValueStore` stores its data either on local disk or in the Apify cloud, + depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. + + If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in + the local directory in the following files: + ``` + {APIFY_LOCAL_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT} + ``` + Note that `{STORE_ID}` is the name or ID of the key-value store. The default key-value store has ID: `default`, + unless you override it by setting the `APIFY_DEFAULT_KEY_VALUE_STORE_ID` environment variable. + The `{KEY}` is the key of the record and `{EXT}` corresponds to the MIME content type of the data value. + + If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the + [Apify Key-value store](https://docs.apify.com/storage/key-value-store) cloud storage. + """ _id: str _name: Optional[str] _client: Union[KeyValueStoreClientAsync, KeyValueStoreClient] def __init__(self, id: str, name: Optional[str], client: Union[ApifyClientAsync, MemoryStorage]) -> None: - """TODO: docs (constructor should be "internal").""" + """Create a `KeyValueStore` instance. + + Do not use the constructor directly, use the `KeyValueStore.open` function instead. + + Args: + id (str): ID of the key-value store. + name (str, optional): Name of the key-value store. + client (ApifyClientAsync or MemoryStorage): The storage client which should be used. + """ self.get_value = _wrap_internal(self._get_value_internal, self.get_value) # type: ignore self.set_value = _wrap_internal(self._set_value_internal, self.set_value) # type: ignore self._id = id @@ -29,7 +68,6 @@ def __init__(self, id: str, name: Optional[str], client: Union[ApifyClientAsync, @classmethod async def _create_instance(cls, store_id_or_name: str, client: Union[ApifyClientAsync, MemoryStorage]) -> 'KeyValueStore': - """TODO: docs.""" key_value_store_client = client.key_value_store(store_id_or_name) key_value_store_info = await key_value_store_client.get() if not key_value_store_info: @@ -58,7 +96,15 @@ async def get_value(cls, key: str, default_value: Optional[T] = None) -> Optiona @classmethod async def get_value(cls, key: str, default_value: Optional[T] = None) -> Optional[T]: - """TODO: docs.""" + """Get a value from the key-value store. + + Args: + key (str): Key of the record to retrieve. + default_value (Any, optional): Default value returned in case the record does not exist. + + Returns: + Any: The value associated with the given key. `default_value` is used in case the record does not exist. + """ store = await cls.open() return await store.get_value(key, default_value) @@ -66,13 +112,22 @@ async def _get_value_internal(self, key: str, default_value: Optional[T] = None) record = await self._client.get_record(key) return record['value'] if record else default_value - async def for_each_key(self, exclusive_start_key: Optional[str] = None) -> AsyncIterator[Tuple[Dict, int, int]]: - """TODO: docs.""" + async def iterate_keys(self, exclusive_start_key: Optional[str] = None) -> AsyncIterator[IterateKeysTuple]: + """Iterate over the keys in the key-value store. + + Args: + exclusive_start_key (str, optional): All keys up to this one (inclusive) are skipped from the result. + + Yields: + IterateKeysTuple: A tuple `(key, info)`, + where `key` is the record key, and `info` is an object that contains a single property `size` + indicating the size of the record in bytes.
+ """ index = 0 while True: list_keys = await self._client.list_keys(exclusive_start_key=exclusive_start_key) for item in list_keys['items']: - yield item, index, item['size'] + yield IterateKeysTuple(item['key'], {'size': item['size']}) index += 1 if not list_keys['isTruncated']: @@ -81,7 +136,13 @@ async def for_each_key(self, exclusive_start_key: Optional[str] = None) -> Async @classmethod async def set_value(cls, key: str, value: Optional[T], content_type: Optional[str] = None) -> None: - """TODO: docs.""" + """Set or delete a value in the key-value store. + + Args: + key (str): The key under which the value should be saved. + value (Any, optional): The value to save. If the value is `None`, the corresponding key-value pair will be deleted. + content_type (str, optional): The content type of the saved value. + """ store = await cls.open() return await store.set_value(key, value, content_type) @@ -95,11 +156,24 @@ async def _set_value_internal(self, key: str, value: Optional[T], content_type: return await self._client.set_record(key, value, content_type) async def drop(self) -> None: - """TODO: docs.""" + """Remove the key-value store either from the Apify cloud storage or from the local directory.""" await self._client.delete() await StorageManager.close_storage(self.__class__, self._id, self._name) @classmethod async def open(cls, store_id_or_name: Optional[str] = None, config: Optional[Configuration] = None) -> 'KeyValueStore': - """TODO: docs.""" + """Open a key-value store. + + Key-value stores are used to store records or files, along with their MIME content type. + The records are stored and retrieved using a unique key. + The actual data is stored either on a local filesystem or in the Apify cloud. + + Args: + key_value_store_id_or_name (str, optional): ID or name of the key-value store to be opened. + If not provided, the method returns the default key-value store associated with the actor run. + config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. + + Returns: + KeyValueStore: An instance of the `KeyValueStore` class for the given ID or name. + """ return await StorageManager.open_storage(cls, store_id_or_name, None, config) diff --git a/src/apify/storages/request_queue.py b/src/apify/storages/request_queue.py index 90bc7a3b..1e8ea51f 100644 --- a/src/apify/storages/request_queue.py +++ b/src/apify/storages/request_queue.py @@ -49,7 +49,35 @@ class RequestQueue: - """TODO: docs.""" + """Represents a queue of URLs to crawl. + + Can be used for deep crawling of websites where you start with several URLs and then recursively + follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders. + + Each URL is represented using an instance of the {@apilink Request} class. + The queue can only contain unique URLs. More precisely, it can only contain request dictionaries + with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden. + To add a single URL multiple times to the queue, + corresponding request dictionary will need to have different `uniqueKey` properties. + + Do not instantiate this class directly, use the `Actor.open_request_queue()` function instead. + + `RequestQueue` stores its data either on local disk or in the Apify cloud, + depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. 
diff --git a/src/apify/storages/request_queue.py b/src/apify/storages/request_queue.py index 90bc7a3b..1e8ea51f 100644 --- a/src/apify/storages/request_queue.py +++ b/src/apify/storages/request_queue.py @@ -49,7 +49,35 @@ class RequestQueue: - """TODO: docs.""" + """Represents a queue of URLs to crawl. + + Can be used for deep crawling of websites where you start with several URLs and then recursively + follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders. + + Each URL is represented by a request dictionary. + The queue can only contain unique URLs. More precisely, it can only contain request dictionaries + with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden. + To add a single URL to the queue multiple times, + the corresponding request dictionaries need to have different `uniqueKey` properties. + + Do not instantiate this class directly, use the `Actor.open_request_queue()` function instead. + + `RequestQueue` stores its data either on local disk or in the Apify cloud, + depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set. + + If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in + the local directory in the following files: + ``` + {APIFY_LOCAL_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json + ``` + Note that `{QUEUE_ID}` is the name or ID of the request queue. The default request queue has ID: `default`, + unless you override it by setting the `APIFY_DEFAULT_REQUEST_QUEUE_ID` environment variable. + The `{REQUEST_ID}` is the ID of the request. + + If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the + [Apify Request Queue](https://docs.apify.com/storage/request-queue) + cloud storage. + """ _id: str _name: Optional[str] @@ -66,7 +94,15 @@ class RequestQueue: _requests_cache: LRUCache[Dict] def __init__(self, id: str, name: Optional[str], client: Union[ApifyClientAsync, MemoryStorage]) -> None: - """TODO: docs (constructor should be "internal").""" + """Create a `RequestQueue` instance. + + Do not use the constructor directly, use the `RequestQueue.open` function instead. + + Args: + id (str): ID of the request queue. + name (str, optional): Name of the request queue. + client (ApifyClientAsync or MemoryStorage): The storage client which should be used. + """ self._id = id self._name = name self._client = client.request_queue(self._id, client_key=self._client_key) @@ -90,13 +126,17 @@ async def _create_instance(cls, request_queue_id_or_name: str, client: Union[Api def _get_default_name(cls, config: Configuration) -> str: return config.default_request_queue_id - async def add_request(self, request_like: Dict, *, forefront: bool = False) -> Dict: # TODO: Validate request with pydantic - """TODO: docs.""" + async def add_request(self, request: Dict, *, forefront: bool = False) -> Dict: # TODO: Validate request with pydantic + """Add a request to the queue. + + Args: + request (dict): The request to add to the queue. + forefront (bool, optional): Whether to add the request to the head or the end of the queue. + + Returns: + dict: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`. + """ self._last_activity = datetime.utcnow() - # const request = requestLike instanceof Request - # ? requestLike - # : new Request(requestLike); - request = request_like cache_key = _unique_key_to_request_id(request['uniqueKey']) cached_info = self._requests_cache.get(cache_key) @@ -126,11 +166,30 @@ async def add_request(self, request_like: Dict, *, forefront: bool = False) -> D return queue_operation_info async def get_request(self, request_id: str) -> Optional[Dict]: - """TODO: docs.""" + """Retrieve a request from the queue. + + Args: + request_id (str): ID of the request to retrieve. + + Returns: + dict, optional: The retrieved request, or `None`, if it does not exist. + """ return await self._client.get_request(request_id) # TODO: Maybe create a Request class? async def fetch_next_request(self) -> Optional[Dict]: - """TODO: docs.""" + """Return the next request in the queue to be processed. + + Once you successfully finish processing the request, you need to call + `RequestQueue.mark_request_as_handled` to mark the request as handled in the queue. + If there was some error in processing the request, call `RequestQueue.reclaim_request` instead, + so that the queue will give the request to some other consumer in another call to the `fetch_next_request` method.
+ + Note that the `None` return value does not mean the queue processing finished; it means there are currently no pending requests. + To check whether all requests in the queue were finished, use `RequestQueue.is_finished` instead. + + Returns: + dict, optional: The request or `None` if there are no more pending requests. + """ await self._ensure_head_is_non_empty() # We are likely done at this point. @@ -183,7 +242,17 @@ async def fetch_next_request(self) -> Optional[Dict]: return request async def mark_request_as_handled(self, request: Dict) -> Optional[Dict]: # TODO: Validate request with pydantic - """TODO: docs.""" + """Mark a request as handled after successful processing. + + Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method. + + Args: + request (dict): The request to mark as handled. + + Returns: + dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`. + `None` if the given request was not in progress. + """ self._last_activity = datetime.utcnow() if request['id'] not in self._in_progress: logging.debug(f'Cannot mark request {request["id"]} as handled, because it is not in progress!') @@ -204,7 +273,18 @@ async def mark_request_as_handled(self, request: Dict) -> Optional[Dict]: # TOD return queue_operation_info async def reclaim_request(self, request: Dict, forefront: bool = False) -> Optional[Dict]: # TODO: Validate request with pydantic - """TODO: docs.""" + """Reclaim a failed request back to the queue. + + The request will be returned for processing again later, + by another call to `RequestQueue.fetch_next_request`. + + Args: + request (dict): The request to return to the queue. + forefront (bool, optional): Whether to add the request to the head or the end of the queue. + + Returns: + dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`. + `None` if the given request was not in progress. + """ self._last_activity = datetime.utcnow() if request['id'] not in self._in_progress: @@ -237,12 +317,24 @@ def _in_progress_count(self) -> int: return len(self._in_progress) async def is_empty(self) -> bool: - """TODO: docs.""" + """Check whether the queue is empty. + + Returns: + bool: `True` if the next call to `RequestQueue.fetch_next_request` would return `None`, otherwise `False`. + """ await self._ensure_head_is_non_empty() return len(self._queue_head_dict) == 0 async def is_finished(self) -> bool: - """TODO: docs.""" + """Check whether the queue is finished. + + Due to the nature of distributed storage used by the queue, + the function might occasionally return a false negative, + but it will never return a false positive. + + Returns: + bool: `True` if all requests were already handled and there are no more left. `False` otherwise. + """ if self._in_progress_count() > 0 and (datetime.utcnow() - self._last_activity).seconds > self._internal_timeout_seconds: message = f'The request queue seems to be stuck for {self._internal_timeout_seconds}s, resetting internal state.'
logging.warning(message) @@ -366,15 +458,33 @@ def _maybe_add_request_to_queue_head(self, request_id: str, forefront: bool) -> self._queue_head_dict[request_id] = request_id async def drop(self) -> None: - """TODO: docs.""" + """Remove the request queue either from the Apify cloud storage or from the local directory.""" await self._client.delete() await StorageManager.close_storage(self.__class__, self._id, self._name) async def get_info(self) -> Optional[Dict]: - """TODO: docs.""" + """Get an object containing general information about the request queue. + + Returns: + dict: Object returned by calling the GET request queue API endpoint. + """ return await self._client.get() @classmethod async def open(cls, request_queue_id_or_name: Optional[str] = None, config: Optional[Configuration] = None) -> 'RequestQueue': - """TODO: docs.""" + """Open a request queue. + + Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in the Apify cloud. + The queue is used for deep crawling of websites, where you start with several URLs and then + recursively follow links to other pages. The data structure supports both breadth-first + and depth-first crawling orders. + + Args: + request_queue_id_or_name (str, optional): ID or name of the request queue to be opened. + If not provided, the method returns the default request queue associated with the actor run. + config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted. + + Returns: + RequestQueue: An instance of the `RequestQueue` class for the given ID or name. + """ return await StorageManager.open_storage(cls, request_queue_id_or_name, None, config) diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 79b975fb..191f7487 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -40,14 +40,16 @@ async def test_get_set_value(key_value_store: KeyValueStore) -> None: async def test_for_each_key(key_value_store: KeyValueStore) -> None: - keys = [i async for i in key_value_store.for_each_key()] + keys = [i async for i in key_value_store.iterate_keys()] assert len(keys) == 0 for i in range(2001): await key_value_store.set_value(str(i).zfill(4), i) - async for item, index, _ in key_value_store.for_each_key(): - assert item['key'] == str(index).zfill(4) - assert index == 2000 + index = 0 + async for key, _ in key_value_store.iterate_keys(): + assert key == str(index).zfill(4) + index += 1 + assert index == 2001 async def test_static_get_set_value() -> None:
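Finally, a sketch of the consumer loop that the new `RequestQueue` docstrings describe (fetch, then mark as handled or reclaim on failure). It assumes the import path shown, uses `https://example.com` as a placeholder URL, and reduces the actual request processing to a stub.

```python
import asyncio

from apify.storages import RequestQueue  # import path assumed

async def main() -> None:
    # Open the default request queue and seed it with a start URL.
    queue = await RequestQueue.open()
    await queue.add_request({'url': 'https://example.com', 'uniqueKey': 'https://example.com'})

    # Standard consumer loop documented on fetch_next_request().
    while not await queue.is_finished():
        request = await queue.fetch_next_request()
        if request is None:
            # No pending request right now (some may still be in progress elsewhere).
            await asyncio.sleep(1)
            continue
        try:
            ...  # process request['url'] here
            await queue.mark_request_as_handled(request)
        except Exception:
            # Return the request to the queue so that it can be retried later.
            await queue.reclaim_request(request)
```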