feat: Upgrade to Crawlee v0.5

vdusek committed Jan 3, 2025
1 parent ccba8d1 commit 2c6d513
Showing 22 changed files with 432 additions and 197 deletions.
188 changes: 150 additions & 38 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -45,7 +45,7 @@ keywords = [
 python = "^3.9"
 apify-client = ">=1.8.1"
 apify-shared = ">=1.2.1"
-crawlee = "~0.4.0"
+crawlee = "~0.5.0"
 cryptography = ">=42.0.0"
 httpx = ">=0.27.0"
 lazy-object-proxy = ">=1.10.0"
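Note: Poetry's tilde constraint `~0.5.0` resolves to `>=0.5.0,<0.6.0`, so this pin tracks Crawlee v0.5 patch releases only. A minimal sketch for confirming the resolved version after `poetry install` (standard-library `importlib.metadata`; the asserted prefix simply mirrors the new pin):

from importlib.metadata import version

# Check that dependency resolution actually picked up a Crawlee 0.5.x release.
crawlee_version = version('crawlee')
assert crawlee_version.startswith('0.5.'), f'unexpected crawlee version: {crawlee_version}'
print(f'crawlee {crawlee_version}')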
62 changes: 36 additions & 26 deletions src/apify/_actor.py
@@ -13,8 +13,9 @@
 from apify_client import ApifyClientAsync
 from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
 from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
-from crawlee import service_container
+from crawlee import service_locator
 from crawlee.events._types import Event, EventMigratingData, EventPersistStateData
+from crawlee.storage_clients import MemoryStorageClient

 from apify._configuration import Configuration
 from apify._consts import EVENT_LISTENERS_TIMEOUT
@@ -71,17 +72,22 @@ def __init__(
         self._configure_logging = configure_logging
         self._apify_client = self.new_client()

-        self._event_manager: EventManager
-        if self._configuration.is_at_home:
-            self._event_manager = PlatformEventManager(
-                config=self._configuration,
-                persist_state_interval=self._configuration.persist_state_interval,
+        # We need to keep both local & cloud storage clients because of the `force_cloud` option.
+        self._local_storage_client = MemoryStorageClient.from_config(config=self.config)
+        self._cloud_storage_client = ApifyStorageClient.from_config(config=self.config)
+
+        # Set the event manager based on whether the Actor is running on the platform or locally.
+        self._event_manager = (
+            PlatformEventManager(
+                config=self.config,
+                persist_state_interval=self.config.persist_state_interval,
             )
-        else:
-            self._event_manager = LocalEventManager(
-                system_info_interval=self._configuration.system_info_interval,
-                persist_state_interval=self._configuration.persist_state_interval,
+            if self.is_at_home()
+            else LocalEventManager(
+                system_info_interval=self.config.system_info_interval,
+                persist_state_interval=self.config.persist_state_interval,
             )
+        )

         self._is_initialized = False

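With both event managers constructed in `__init__` as above, Actor code can subscribe to the events they emit. A hedged sketch of such a subscription (the `Event` enum and `EventPersistStateData` come from the imports added in this diff; the `Actor.on(event, listener)` registration follows the SDK's public API):

from crawlee.events._types import Event, EventPersistStateData

from apify import Actor

async def persist_state(event_data: EventPersistStateData) -> None:
    # Invoked on each persist-state event emitted by the event manager.
    Actor.log.info('Persisting state...', extra={'is_migrating': event_data.is_migrating})

async def main() -> None:
    async with Actor:
        Actor.on(Event.PERSIST_STATE, persist_state)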
@@ -94,9 +100,6 @@ async def __aenter__(self) -> Self:
         When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while
         executing the block code, the `Actor.fail` method is called.
         """
-        if self._configure_logging:
-            _configure_logging(self._configuration)
-
         await self.init()
         return self

@@ -184,18 +187,21 @@ async def init(self) -> None:
         if self._is_initialized:
             raise RuntimeError('The Actor was already initialized!')

-        if self._configuration.token:
-            service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration))
+        self._is_exiting = False
+        self._was_final_persist_state_emitted = False

-        if self._configuration.is_at_home:
-            service_container.set_default_storage_client_type('cloud')
+        # Register services in the service locator.
+        if self.is_at_home():
+            service_locator.set_storage_client(self._cloud_storage_client)
         else:
-            service_container.set_default_storage_client_type('local')
+            service_locator.set_storage_client(self._local_storage_client)

-        service_container.set_event_manager(self._event_manager)
+        service_locator.set_event_manager(self.event_manager)
+        service_locator.set_configuration(self.configuration)

-        self._is_exiting = False
-        self._was_final_persist_state_emitted = False
+        # The logging configuration has to be called after all service_locator set methods.
+        if self._configure_logging:
+            _configure_logging()

         self.log.info('Initializing Actor...')
         self.log.info('System info', extra=get_system_info())
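Once `init()` has registered everything, code running inside the Actor can pull the same services back out of Crawlee's global service locator instead of threading them through arguments. A minimal sketch (assuming the v0.5 getters mirror the setters used above; `get_storage_client()` also appears in the `open_*` methods below):

from crawlee import service_locator

# Each getter returns whatever Actor.init() registered above.
storage_client = service_locator.get_storage_client()
event_manager = service_locator.get_event_manager()
configuration = service_locator.get_configuration()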
@@ -245,7 +251,6 @@ async def finalize() -> None:
                 await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)

             await self._event_manager.__aexit__(None, None, None)
-            cast(dict, service_container._services).clear()  # noqa: SLF001

         await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
         self._is_initialized = False
@@ -349,11 +354,13 @@ async def open_dataset(
         self._raise_if_not_initialized()
         self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

+        storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
+
         return await Dataset.open(
             id=id,
             name=name,
             configuration=self._configuration,
-            storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
+            storage_client=storage_client,
         )

     async def open_key_value_store(
@@ -381,12 +388,13 @@ async def open_key_value_store(
         """
         self._raise_if_not_initialized()
         self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
+        storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()

         return await KeyValueStore.open(
             id=id,
             name=name,
             configuration=self._configuration,
-            storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
+            storage_client=storage_client,
         )

     async def open_request_queue(
@@ -417,11 +425,13 @@ async def open_request_queue(
         self._raise_if_not_initialized()
         self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

+        storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
+
         return await RequestQueue.open(
             id=id,
             name=name,
             configuration=self._configuration,
-            storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
+            storage_client=storage_client,
         )

     async def push_data(self, data: dict | list[dict]) -> None:
@@ -963,7 +973,7 @@ async def create_proxy_configuration(
         password: str | None = None,
         groups: list[str] | None = None,
         country_code: str | None = None,
-        proxy_urls: list[str] | None = None,
+        proxy_urls: list[str | None] | None = None,
         new_url_function: _NewUrlFunction | None = None,
     ) -> ProxyConfiguration | None:
         """Create a ProxyConfiguration object with the passed proxy configuration.
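Taken together, the `open_dataset`, `open_key_value_store`, and `open_request_queue` changes above are what make `force_cloud` work in v0.5: the cloud client is always at hand, so a locally running Actor can still open platform storages. A usage sketch (the dataset name and record are illustrative only; an Apify token must be configured for the cloud path):

from apify import Actor

async def main() -> None:
    async with Actor:
        # Opens the dataset on the Apify platform even when running locally.
        dataset = await Actor.open_dataset(name='my-results', force_cloud=True)
        await dataset.push_data({'url': 'https://example.com', 'status': 200})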
12 changes: 12 additions & 0 deletions src/apify/_configuration.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from datetime import datetime, timedelta
+from logging import getLogger
 from typing import Annotated, Any

 from pydantic import AliasChoices, BeforeValidator, Field
@@ -12,6 +13,8 @@

 from apify._utils import docs_group

+logger = getLogger(__name__)
+

 def _transform_to_list(value: Any) -> list[str] | None:
     if value is None:
@@ -353,6 +356,15 @@ class Configuration(CrawleeConfiguration):
         ),
     ] = None

+    @classmethod
+    def get_global_configuration(cls) -> Configuration:
+        """Retrieve the global instance of the configuration.
+
+        Mostly for backwards compatibility. It is recommended to use `service_locator.get_configuration()`
+        instead.
+        """
+        return cls()
+

 # Monkey-patch the base class so that it works with the extended configuration
 CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration  # type: ignore[method-assign]
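A short illustration of what the monkey-patch buys: code that only knows the Crawlee base class still receives the Apify-extended configuration. A hedged sketch (import paths assumed from the two packages' public layouts):

from crawlee.configuration import Configuration as CrawleeConfiguration

from apify import Configuration

# The patched base-class hook now returns the Apify subclass.
config = CrawleeConfiguration.get_global_configuration()
assert isinstance(config, Configuration)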
6 changes: 3 additions & 3 deletions src/apify/_proxy_configuration.py
@@ -111,9 +111,9 @@ def __init__(
         password: str | None = None,
         groups: list[str] | None = None,
         country_code: str | None = None,
-        proxy_urls: list[str] | None = None,
+        proxy_urls: list[str | None] | None = None,
         new_url_function: _NewUrlFunction | None = None,
-        tiered_proxy_urls: list[list[str]] | None = None,
+        tiered_proxy_urls: list[list[str | None]] | None = None,
         _actor_config: Configuration | None = None,
         _apify_client: ApifyClientAsync | None = None,
     ) -> None:
@@ -148,7 +148,7 @@ def __init__(
                 ' "groups" or "country_code".'
             )

-        if proxy_urls and any('apify.com' in url for url in proxy_urls):
+        if proxy_urls and any('apify.com' in (url or '') for url in proxy_urls):
             logger.warning(
                 'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties '
                 'instead of `proxy_urls`.\n'
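The widened `list[str | None]` annotations above exist because in Crawlee v0.5 a `None` entry in a proxy list means "send the request without a proxy", which is mainly useful as the first tier of `tiered_proxy_urls`. A self-contained sketch of the new shape and of the `(url or '')` guard (the proxy URL is illustrative):

# Hypothetical tier list: try no proxy first, then fall back to a real proxy.
proxy_urls: list[str | None] = [None, 'http://proxy.example.com:8000']

# The `(url or '')` guard keeps the substring check safe for `None` entries.
uses_apify_proxy = any('apify.com' in (url or '') for url in proxy_urls)
assert uses_apify_proxy is False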
14 changes: 12 additions & 2 deletions src/apify/apify_storage_client/_apify_storage_client.py
@@ -1,10 +1,13 @@
 from __future__ import annotations

+from typing import TYPE_CHECKING
+
 from typing_extensions import override

 from apify_client import ApifyClientAsync
 from crawlee._utils.crypto import crypto_random_object_id
-from crawlee.base_storage_client import BaseStorageClient
+from crawlee.storage_clients import BaseStorageClient

-from apify._configuration import Configuration
 from apify._utils import docs_group
 from apify.apify_storage_client._dataset_client import DatasetClient
 from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
@@ -13,6 +16,9 @@
 from apify.apify_storage_client._request_queue_client import RequestQueueClient
 from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient

+if TYPE_CHECKING:
+    from apify._configuration import Configuration
+

 @docs_group('Classes')
 class ApifyStorageClient(BaseStorageClient):
@@ -29,6 +35,10 @@ def __init__(self, *, configuration: Configuration) -> None:
         )
         self._configuration = configuration

+    @classmethod
+    def from_config(cls, config: Configuration) -> ApifyStorageClient:
+        return cls(configuration=config)
+
     @override
     def dataset(self, id: str) -> DatasetClient:
         return DatasetClient(self._apify_client.dataset(id))
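The new `from_config()` classmethod mirrors `MemoryStorageClient.from_config()` from Crawlee v0.5, giving `Actor.__init__` a uniform way to build both clients. A minimal usage sketch (the `apify.apify_storage_client` import path is assumed from this package's layout):

from apify._configuration import Configuration
from apify.apify_storage_client import ApifyStorageClient

# Build a platform storage client from a configuration object.
storage_client = ApifyStorageClient.from_config(Configuration.get_global_configuration())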
3 changes: 2 additions & 1 deletion src/apify/apify_storage_client/_dataset_client.py
@@ -4,7 +4,8 @@

 from typing_extensions import override

-from crawlee.base_storage_client import BaseDatasetClient, DatasetItemsListPage, DatasetMetadata
+from crawlee.storage_clients._base import BaseDatasetClient
+from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
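This file and the storage-client files below all apply the same v0.5 import split: abstract client interfaces now live in `crawlee.storage_clients._base`, while result and metadata models moved to `crawlee.storage_clients.models`. A sketch of the pattern, with names taken from the imports in these diffs:

# Interfaces (to subclass) vs. models (to return) after the v0.5 split.
from crawlee.storage_clients._base import BaseDatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata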
3 changes: 2 additions & 1 deletion src/apify/apify_storage_client/_dataset_collection_client.py
@@ -4,7 +4,8 @@

 from typing_extensions import override

-from crawlee.base_storage_client import BaseDatasetCollectionClient, DatasetListPage, DatasetMetadata
+from crawlee.storage_clients._base import BaseDatasetCollectionClient
+from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata

 if TYPE_CHECKING:
     from apify_client.clients import DatasetCollectionClientAsync
8 changes: 2 additions & 6 deletions src/apify/apify_storage_client/_key_value_store_client.py
@@ -5,12 +5,8 @@

 from typing_extensions import override

-from crawlee.base_storage_client import (
-    BaseKeyValueStoreClient,
-    KeyValueStoreListKeysPage,
-    KeyValueStoreMetadata,
-    KeyValueStoreRecord,
-)
+from crawlee.storage_clients._base import BaseKeyValueStoreClient
+from crawlee.storage_clients.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord

 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
3 changes: 2 additions & 1 deletion src/apify/apify_storage_client/_key_value_store_collection_client.py
@@ -4,7 +4,8 @@

 from typing_extensions import override

-from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient, KeyValueStoreListPage, KeyValueStoreMetadata
+from crawlee.storage_clients._base import BaseKeyValueStoreCollectionClient
+from crawlee.storage_clients.models import KeyValueStoreListPage, KeyValueStoreMetadata

 if TYPE_CHECKING:
     from apify_client.clients import KeyValueStoreCollectionClientAsync
4 changes: 2 additions & 2 deletions src/apify/apify_storage_client/_request_queue_client.py
@@ -5,8 +5,8 @@

 from typing_extensions import override

 from crawlee import Request
-from crawlee.base_storage_client import (
-    BaseRequestQueueClient,
+from crawlee.storage_clients._base import BaseRequestQueueClient
+from crawlee.storage_clients.models import (
     BatchRequestsOperationResponse,
     ProcessedRequest,
     ProlongRequestLockResponse,
3 changes: 2 additions & 1 deletion src/apify/apify_storage_client/_request_queue_collection_client.py
@@ -4,7 +4,8 @@

 from typing_extensions import override

-from crawlee.base_storage_client import BaseRequestQueueCollectionClient, RequestQueueListPage, RequestQueueMetadata
+from crawlee.storage_clients._base import BaseRequestQueueCollectionClient
+from crawlee.storage_clients.models import RequestQueueListPage, RequestQueueMetadata

 if TYPE_CHECKING:
     from apify_client.clients import RequestQueueCollectionClientAsync
12 changes: 4 additions & 8 deletions src/apify/log.py
@@ -1,14 +1,10 @@
 from __future__ import annotations

 import logging
-from typing import TYPE_CHECKING

 from apify_shared.utils import ignore_docs
 from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level

-if TYPE_CHECKING:
-    from apify import Configuration
-
 # Name of the logger used throughout the library (resolves to 'apify')
 logger_name = __name__.split('.')[0]
@@ -21,11 +17,11 @@ class ActorLogFormatter(CrawleeLogFormatter):  # noqa: D101 (Inherited from pare
     pass


-def _configure_logging(configuration: Configuration) -> None:
+def _configure_logging() -> None:
     apify_client_logger = logging.getLogger('apify_client')
-    configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
+    configure_logger(apify_client_logger, remove_old_handlers=True)

-    level = get_configured_log_level(configuration)
+    level = get_configured_log_level()

     # Keep apify_client logger quiet unless debug logging is requested
     if level > logging.DEBUG:
@@ -42,4 +38,4 @@ def _configure_logging(configuration: Configuration) -> None:

     # Use configured log level for apify logger
     apify_logger = logging.getLogger('apify')
-    configure_logger(apify_logger, configuration, remove_old_handlers=True)
+    configure_logger(apify_logger, remove_old_handlers=True)
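Because `_configure_logging()` no longer takes a `Configuration`, it has to read one from the service locator, which is exactly why `Actor.init()` above defers it until after all `service_locator.set_*` calls. A hedged sketch of that ordering contract (importing the private helper directly is for illustration only):

from crawlee import service_locator

from apify._configuration import Configuration
from apify.log import _configure_logging

# Register the configuration first; logging setup then reads it from the locator.
service_locator.set_configuration(Configuration())
_configure_logging()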
2 changes: 1 addition & 1 deletion src/apify/storages/_request_list.py
@@ -11,7 +11,7 @@
 from crawlee import Request
 from crawlee._types import HttpMethod
 from crawlee.http_clients import BaseHttpClient, HttpxHttpClient
-from crawlee.storages import RequestList as CrawleeRequestList
+from crawlee.request_loaders import RequestList as CrawleeRequestList

 from apify._utils import docs_group

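In Crawlee v0.5, `RequestList` is no longer a storage: it moved to the new `request_loaders` package, which the SDK's `RequestList` wrapper now builds on. A minimal sketch of the new import path (constructor arguments assumed from Crawlee's request-loader API):

from crawlee.request_loaders import RequestList

# A static list of start URLs served through the request-loader interface.
request_list = RequestList(requests=['https://example.com', 'https://apify.com'])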