diff --git a/poetry.lock b/poetry.lock index ffa33dbe..d88aaa9f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -752,36 +752,36 @@ toml = ["tomli"] [[package]] name = "crawlee" -version = "0.4.5" +version = "0.5.0" description = "Crawlee for Python" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "crawlee-0.4.5-py3-none-any.whl", hash = "sha256:cbb730bbd9fd08671344af9ff46af2be6e50a3e57011b82e58782c058f8c76c5"}, - {file = "crawlee-0.4.5.tar.gz", hash = "sha256:c657eeb59d5471791e20801be23699cd1f03ad7883f126adedb6b3ed008c281d"}, + {file = "crawlee-0.5.0-py3-none-any.whl", hash = "sha256:ddab7bf7883b649d412c2b1ebfea9c4f44c71c55ec4b03f12c732fcc9c3a4ea1"}, + {file = "crawlee-0.5.0.tar.gz", hash = "sha256:64261648a8b20f6a63061c355b2155bacafad31cb201002fefb8c1c55611e764"}, ] [package.dependencies] +apify = ">=2.0.0" colorama = ">=0.4.0" cookiecutter = ">=2.6.0" docutils = ">=0.21.0" eval-type-backport = ">=0.2.0" -httpx = {version = ">=0.27.0,<0.28.0", extras = ["brotli", "http2"]} +httpx = {version = ">=0.27.0", extras = ["brotli", "http2", "zstd"]} inquirer = ">=3.3.0" more_itertools = ">=10.2.0" psutil = ">=6.0.0" pydantic = ">=2.8.1,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2" -pydantic-settings = ">=2.2.0" +pydantic-settings = ">=2.2.0,<2.7.0" pyee = ">=9.0.0" sortedcollections = ">=2.1.0" tldextract = ">=5.1.0" typer = ">=0.12.0" typing-extensions = ">=4.1.0" -yarl = ">=1.18.0,<2.0.0" +yarl = ">=1.18.0" [package.extras] -all = ["apify (>=2.0.0)", "beautifulsoup4 (>=4.12.0)", "curl-cffi (>=0.7.2)", "html5lib (>=1.0)", "lxml (>=5.2.0)", "playwright (>=1.27.0)"] -apify = ["apify (>=2.0.0)"] +all = ["beautifulsoup4 (>=4.12.0)", "curl-cffi (>=0.7.2)", "html5lib (>=1.0)", "lxml (>=5.2.0)", "parsel (>=1.9.0)", "playwright (>=1.27.0)"] beautifulsoup = ["beautifulsoup4 (>=4.12.0)", "html5lib (>=1.0)", "lxml (>=5.2.0)"] curl-impersonate = ["curl-cffi (>=0.7.2)"] parsel = ["parsel (>=1.9.0)"] @@ -1136,13 +1136,13 @@ trio = ["trio (>=0.22.0,<1.0)"] [[package]] name = "httpx" -version = "0.27.2" +version = "0.28.1" description = "The next generation HTTP client." 
optional = false python-versions = ">=3.8" files = [ - {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, - {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, + {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"}, + {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, ] [package.dependencies] @@ -1153,7 +1153,7 @@ certifi = "*" h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""} httpcore = "==1.*" idna = "*" -sniffio = "*" +zstandard = {version = ">=0.18.0", optional = true, markers = "extra == \"zstd\""} [package.extras] brotli = ["brotli", "brotlicffi"] @@ -2281,13 +2281,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydantic-settings" -version = "2.7.0" +version = "2.6.1" description = "Settings management using Pydantic" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_settings-2.7.0-py3-none-any.whl", hash = "sha256:e00c05d5fa6cbbb227c84bd7487c5c1065084119b750df7c8c1a554aed236eb5"}, - {file = "pydantic_settings-2.7.0.tar.gz", hash = "sha256:ac4bfd4a36831a48dbf8b2d9325425b549a0a6f18cea118436d728eb4f1c4d66"}, + {file = "pydantic_settings-2.6.1-py3-none-any.whl", hash = "sha256:7fb0637c786a558d3103436278a7c4f1cfd29ba8973238a50c5bb9a55387da87"}, + {file = "pydantic_settings-2.6.1.tar.gz", hash = "sha256:e0f92546d8a9923cb8941689abf85d6601a8c19a23e97a34b2964a2e3f813ca0"}, ] [package.dependencies] @@ -2434,13 +2434,13 @@ dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments [[package]] name = "pytest-asyncio" -version = "0.25.0" +version = "0.25.1" description = "Pytest support for asyncio" optional = false python-versions = ">=3.9" files = [ - {file = "pytest_asyncio-0.25.0-py3-none-any.whl", hash = "sha256:db5432d18eac6b7e28b46dcd9b69921b55c3b1086e85febfe04e70b18d9e81b3"}, - {file = "pytest_asyncio-0.25.0.tar.gz", hash = "sha256:8c0610303c9e0442a5db8604505fc0f545456ba1528824842b37b4a626cbf609"}, + {file = "pytest_asyncio-0.25.1-py3-none-any.whl", hash = "sha256:c84878849ec63ff2ca509423616e071ef9cd8cc93c053aa33b5b8fb70a990671"}, + {file = "pytest_asyncio-0.25.1.tar.gz", hash = "sha256:79be8a72384b0c917677e00daa711e07db15259f4d23203c59012bcd989d4aee"}, ] [package.dependencies] @@ -2715,29 +2715,29 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruff" -version = "0.8.4" +version = "0.8.5" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.8.4-py3-none-linux_armv6l.whl", hash = "sha256:58072f0c06080276804c6a4e21a9045a706584a958e644353603d36ca1eb8a60"}, - {file = "ruff-0.8.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ffb60904651c00a1e0b8df594591770018a0f04587f7deeb3838344fe3adabac"}, - {file = "ruff-0.8.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:6ddf5d654ac0d44389f6bf05cee4caeefc3132a64b58ea46738111d687352296"}, - {file = "ruff-0.8.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e248b1f0fa2749edd3350a2a342b67b43a2627434c059a063418e3d375cfe643"}, - {file = "ruff-0.8.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bf197b98ed86e417412ee3b6c893f44c8864f816451441483253d5ff22c0e81e"}, - {file = "ruff-0.8.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c41319b85faa3aadd4d30cb1cffdd9ac6b89704ff79f7664b853785b48eccdf3"}, - {file = "ruff-0.8.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:9f8402b7c4f96463f135e936d9ab77b65711fcd5d72e5d67597b543bbb43cf3f"}, - {file = "ruff-0.8.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e4e56b3baa9c23d324ead112a4fdf20db9a3f8f29eeabff1355114dd96014604"}, - {file = "ruff-0.8.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:736272574e97157f7edbbb43b1d046125fce9e7d8d583d5d65d0c9bf2c15addf"}, - {file = "ruff-0.8.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5fe710ab6061592521f902fca7ebcb9fabd27bc7c57c764298b1c1f15fff720"}, - {file = "ruff-0.8.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:13e9ec6d6b55f6da412d59953d65d66e760d583dd3c1c72bf1f26435b5bfdbae"}, - {file = "ruff-0.8.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:97d9aefef725348ad77d6db98b726cfdb075a40b936c7984088804dfd38268a7"}, - {file = "ruff-0.8.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:ab78e33325a6f5374e04c2ab924a3367d69a0da36f8c9cb6b894a62017506111"}, - {file = "ruff-0.8.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:8ef06f66f4a05c3ddbc9121a8b0cecccd92c5bf3dd43b5472ffe40b8ca10f0f8"}, - {file = "ruff-0.8.4-py3-none-win32.whl", hash = "sha256:552fb6d861320958ca5e15f28b20a3d071aa83b93caee33a87b471f99a6c0835"}, - {file = "ruff-0.8.4-py3-none-win_amd64.whl", hash = "sha256:f21a1143776f8656d7f364bd264a9d60f01b7f52243fbe90e7670c0dfe0cf65d"}, - {file = "ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08"}, - {file = "ruff-0.8.4.tar.gz", hash = "sha256:0d5f89f254836799af1615798caa5f80b7f935d7a670fad66c5007928e57ace8"}, + {file = "ruff-0.8.5-py3-none-linux_armv6l.whl", hash = "sha256:5ad11a5e3868a73ca1fa4727fe7e33735ea78b416313f4368c504dbeb69c0f88"}, + {file = "ruff-0.8.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:f69ab37771ea7e0715fead8624ec42996d101269a96e31f4d31be6fc33aa19b7"}, + {file = "ruff-0.8.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:b5462d7804558ccff9c08fe8cbf6c14b7efe67404316696a2dde48297b1925bb"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d56de7220a35607f9fe59f8a6d018e14504f7b71d784d980835e20fc0611cd50"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9d99cf80b0429cbebf31cbbf6f24f05a29706f0437c40413d950e67e2d4faca4"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:7b75ac29715ac60d554a049dbb0ef3b55259076181c3369d79466cb130eb5afd"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c9d526a62c9eda211b38463528768fd0ada25dad524cb33c0e99fcff1c67b5dc"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:587c5e95007612c26509f30acc506c874dab4c4abbacd0357400bd1aa799931b"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:622b82bf3429ff0e346835ec213aec0a04d9730480cbffbb6ad9372014e31bbd"}, + {file = "ruff-0.8.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f99be814d77a5dac8a8957104bdd8c359e85c86b0ee0e38dca447cb1095f70fb"}, + {file = "ruff-0.8.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c01c048f9c3385e0fd7822ad0fd519afb282af9cf1778f3580e540629df89725"}, + {file = "ruff-0.8.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:7512e8cb038db7f5db6aae0e24735ff9ea03bb0ed6ae2ce534e9baa23c1dc9ea"}, + {file = "ruff-0.8.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:762f113232acd5b768d6b875d16aad6b00082add40ec91c927f0673a8ec4ede8"}, + {file = "ruff-0.8.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:03a90200c5dfff49e4c967b405f27fdfa81594cbb7c5ff5609e42d7fe9680da5"}, + {file = "ruff-0.8.5-py3-none-win32.whl", hash = "sha256:8710ffd57bdaa6690cbf6ecff19884b8629ec2a2a2a2f783aa94b1cc795139ed"}, + {file = "ruff-0.8.5-py3-none-win_amd64.whl", hash = "sha256:4020d8bf8d3a32325c77af452a9976a9ad6455773bcb94991cf15bd66b347e47"}, + {file = "ruff-0.8.5-py3-none-win_arm64.whl", hash = "sha256:134ae019ef13e1b060ab7136e7828a6d83ea727ba123381307eb37c6bd5e01cb"}, + {file = "ruff-0.8.5.tar.gz", hash = "sha256:1098d36f69831f7ff2a1da3e6407d5fbd6dfa2559e4f74ff2d260c5588900317"}, ] [[package]] @@ -3524,10 +3524,122 @@ docs = ["Sphinx", "furo", "repoze.sphinx.autointerface"] test = ["coverage[toml]", "zope.event", "zope.testing"] testing = ["coverage[toml]", "zope.event", "zope.testing"] +[[package]] +name = "zstandard" +version = "0.23.0" +description = "Zstandard bindings for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, + {file = "zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77da4c6bfa20dd5ea25cbf12c76f181a8e8cd7ea231c673828d0386b1740b8dc"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2170c7e0367dde86a2647ed5b6f57394ea7f53545746104c6b09fc1f4223573"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c16842b846a8d2a145223f520b7e18b57c8f476924bda92aeee3a88d11cfc391"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:157e89ceb4054029a289fb504c98c6a9fe8010f1680de0201b3eb5dc20aa6d9e"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:203d236f4c94cd8379d1ea61db2fce20730b4c38d7f1c34506a31b34edc87bdd"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:dc5d1a49d3f8262be192589a4b72f0d03b72dcf46c51ad5852a4fdc67be7b9e4"}, + {file = 
"zstandard-0.23.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:752bf8a74412b9892f4e5b58f2f890a039f57037f52c89a740757ebd807f33ea"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80080816b4f52a9d886e67f1f96912891074903238fe54f2de8b786f86baded2"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:84433dddea68571a6d6bd4fbf8ff398236031149116a7fff6f777ff95cad3df9"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ab19a2d91963ed9e42b4e8d77cd847ae8381576585bad79dbd0a8837a9f6620a"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:59556bf80a7094d0cfb9f5e50bb2db27fefb75d5138bb16fb052b61b0e0eeeb0"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:27d3ef2252d2e62476389ca8f9b0cf2bbafb082a3b6bfe9d90cbcbb5529ecf7c"}, + {file = "zstandard-0.23.0-cp310-cp310-win32.whl", hash = "sha256:5d41d5e025f1e0bccae4928981e71b2334c60f580bdc8345f824e7c0a4c2a813"}, + {file = "zstandard-0.23.0-cp310-cp310-win_amd64.whl", hash = "sha256:519fbf169dfac1222a76ba8861ef4ac7f0530c35dd79ba5727014613f91613d4"}, + {file = "zstandard-0.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:34895a41273ad33347b2fc70e1bff4240556de3c46c6ea430a7ed91f9042aa4e"}, + {file = "zstandard-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77ea385f7dd5b5676d7fd943292ffa18fbf5c72ba98f7d09fc1fb9e819b34c23"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:983b6efd649723474f29ed42e1467f90a35a74793437d0bc64a5bf482bedfa0a"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80a539906390591dd39ebb8d773771dc4db82ace6372c4d41e2d293f8e32b8db"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:445e4cb5048b04e90ce96a79b4b63140e3f4ab5f662321975679b5f6360b90e2"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd30d9c67d13d891f2360b2a120186729c111238ac63b43dbd37a5a40670b8ca"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d20fd853fbb5807c8e84c136c278827b6167ded66c72ec6f9a14b863d809211c"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ed1708dbf4d2e3a1c5c69110ba2b4eb6678262028afd6c6fbcc5a8dac9cda68e"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:be9b5b8659dff1f913039c2feee1aca499cfbc19e98fa12bc85e037c17ec6ca5"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:65308f4b4890aa12d9b6ad9f2844b7ee42c7f7a4fd3390425b242ffc57498f48"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:98da17ce9cbf3bfe4617e836d561e433f871129e3a7ac16d6ef4c680f13a839c"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8ed7d27cb56b3e058d3cf684d7200703bcae623e1dcc06ed1e18ecda39fee003"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:b69bb4f51daf461b15e7b3db033160937d3ff88303a7bc808c67bbc1eaf98c78"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:034b88913ecc1b097f528e42b539453fa82c3557e414b3de9d5632c80439a473"}, + {file = "zstandard-0.23.0-cp311-cp311-win32.whl", hash = "sha256:f2d4380bf5f62daabd7b751ea2339c1a21d1c9463f1feb7fc2bdcea2c29c3160"}, + {file = 
"zstandard-0.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:62136da96a973bd2557f06ddd4e8e807f9e13cbb0bfb9cc06cfe6d98ea90dfe0"}, + {file = "zstandard-0.23.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b4567955a6bc1b20e9c31612e615af6b53733491aeaa19a6b3b37f3b65477094"}, + {file = "zstandard-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e172f57cd78c20f13a3415cc8dfe24bf388614324d25539146594c16d78fcc8"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0e166f698c5a3e914947388c162be2583e0c638a4703fc6a543e23a88dea3c1"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12a289832e520c6bd4dcaad68e944b86da3bad0d339ef7989fb7e88f92e96072"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d50d31bfedd53a928fed6707b15a8dbeef011bb6366297cc435accc888b27c20"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72c68dda124a1a138340fb62fa21b9bf4848437d9ca60bd35db36f2d3345f373"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53dd9d5e3d29f95acd5de6802e909ada8d8d8cfa37a3ac64836f3bc4bc5512db"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6a41c120c3dbc0d81a8e8adc73312d668cd34acd7725f036992b1b72d22c1772"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:40b33d93c6eddf02d2c19f5773196068d875c41ca25730e8288e9b672897c105"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9206649ec587e6b02bd124fb7799b86cddec350f6f6c14bc82a2b70183e708ba"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76e79bc28a65f467e0409098fa2c4376931fd3207fbeb6b956c7c476d53746dd"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:66b689c107857eceabf2cf3d3fc699c3c0fe8ccd18df2219d978c0283e4c508a"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9c236e635582742fee16603042553d276cca506e824fa2e6489db04039521e90"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a8fffdbd9d1408006baaf02f1068d7dd1f016c6bcb7538682622c556e7b68e35"}, + {file = "zstandard-0.23.0-cp312-cp312-win32.whl", hash = "sha256:dc1d33abb8a0d754ea4763bad944fd965d3d95b5baef6b121c0c9013eaf1907d"}, + {file = "zstandard-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:64585e1dba664dc67c7cdabd56c1e5685233fbb1fc1966cfba2a340ec0dfff7b"}, + {file = "zstandard-0.23.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:576856e8594e6649aee06ddbfc738fec6a834f7c85bf7cadd1c53d4a58186ef9"}, + {file = "zstandard-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38302b78a850ff82656beaddeb0bb989a0322a8bbb1bf1ab10c17506681d772a"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2240ddc86b74966c34554c49d00eaafa8200a18d3a5b6ffbf7da63b11d74ee2"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ef230a8fd217a2015bc91b74f6b3b7d6522ba48be29ad4ea0ca3a3775bf7dd5"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:774d45b1fac1461f48698a9d4b5fa19a69d47ece02fa469825b442263f04021f"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6f77fa49079891a4aab203d0b1744acc85577ed16d767b52fc089d83faf8d8ed"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ac184f87ff521f4840e6ea0b10c0ec90c6b1dcd0bad2f1e4a9a1b4fa177982ea"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c363b53e257246a954ebc7c488304b5592b9c53fbe74d03bc1c64dda153fb847"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e7792606d606c8df5277c32ccb58f29b9b8603bf83b48639b7aedf6df4fe8171"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a0817825b900fcd43ac5d05b8b3079937073d2b1ff9cf89427590718b70dd840"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:9da6bc32faac9a293ddfdcb9108d4b20416219461e4ec64dfea8383cac186690"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:fd7699e8fd9969f455ef2926221e0233f81a2542921471382e77a9e2f2b57f4b"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d477ed829077cd945b01fc3115edd132c47e6540ddcd96ca169facff28173057"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa6ce8b52c5987b3e34d5674b0ab529a4602b632ebab0a93b07bfb4dfc8f8a33"}, + {file = "zstandard-0.23.0-cp313-cp313-win32.whl", hash = "sha256:a9b07268d0c3ca5c170a385a0ab9fb7fdd9f5fd866be004c4ea39e44edce47dd"}, + {file = "zstandard-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:f3513916e8c645d0610815c257cbfd3242adfd5c4cfa78be514e5a3ebb42a41b"}, + {file = "zstandard-0.23.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2ef3775758346d9ac6214123887d25c7061c92afe1f2b354f9388e9e4d48acfc"}, + {file = "zstandard-0.23.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4051e406288b8cdbb993798b9a45c59a4896b6ecee2f875424ec10276a895740"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2d1a054f8f0a191004675755448d12be47fa9bebbcffa3cdf01db19f2d30a54"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f83fa6cae3fff8e98691248c9320356971b59678a17f20656a9e59cd32cee6d8"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:32ba3b5ccde2d581b1e6aa952c836a6291e8435d788f656fe5976445865ae045"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f146f50723defec2975fb7e388ae3a024eb7151542d1599527ec2aa9cacb152"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bfe8de1da6d104f15a60d4a8a768288f66aa953bbe00d027398b93fb9680b26"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:29a2bc7c1b09b0af938b7a8343174b987ae021705acabcbae560166567f5a8db"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:61f89436cbfede4bc4e91b4397eaa3e2108ebe96d05e93d6ccc95ab5714be512"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:53ea7cdc96c6eb56e76bb06894bcfb5dfa93b7adcf59d61c6b92674e24e2dd5e"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:a4ae99c57668ca1e78597d8b06d5af837f377f340f4cce993b551b2d7731778d"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:379b378ae694ba78cef921581ebd420c938936a153ded602c4fea612b7eaa90d"}, + {file = 
"zstandard-0.23.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:50a80baba0285386f97ea36239855f6020ce452456605f262b2d33ac35c7770b"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:61062387ad820c654b6a6b5f0b94484fa19515e0c5116faf29f41a6bc91ded6e"}, + {file = "zstandard-0.23.0-cp38-cp38-win32.whl", hash = "sha256:b8c0bd73aeac689beacd4e7667d48c299f61b959475cdbb91e7d3d88d27c56b9"}, + {file = "zstandard-0.23.0-cp38-cp38-win_amd64.whl", hash = "sha256:a05e6d6218461eb1b4771d973728f0133b2a4613a6779995df557f70794fd60f"}, + {file = "zstandard-0.23.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3aa014d55c3af933c1315eb4bb06dd0459661cc0b15cd61077afa6489bec63bb"}, + {file = "zstandard-0.23.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7f0804bb3799414af278e9ad51be25edf67f78f916e08afdb983e74161b916"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb2b1ecfef1e67897d336de3a0e3f52478182d6a47eda86cbd42504c5cbd009a"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:837bb6764be6919963ef41235fd56a6486b132ea64afe5fafb4cb279ac44f259"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1516c8c37d3a053b01c1c15b182f3b5f5eef19ced9b930b684a73bad121addf4"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48ef6a43b1846f6025dde6ed9fee0c24e1149c1c25f7fb0a0585572b2f3adc58"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11e3bf3c924853a2d5835b24f03eeba7fc9b07d8ca499e247e06ff5676461a15"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2fb4535137de7e244c230e24f9d1ec194f61721c86ebea04e1581d9d06ea1269"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8c24f21fa2af4bb9f2c492a86fe0c34e6d2c63812a839590edaf177b7398f700"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a8c86881813a78a6f4508ef9daf9d4995b8ac2d147dcb1a450448941398091c9"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fe3b385d996ee0822fd46528d9f0443b880d4d05528fd26a9119a54ec3f91c69"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:82d17e94d735c99621bf8ebf9995f870a6b3e6d14543b99e201ae046dfe7de70"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c7c517d74bea1a6afd39aa612fa025e6b8011982a0897768a2f7c8ab4ebb78a2"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1fd7e0f1cfb70eb2f95a19b472ee7ad6d9a0a992ec0ae53286870c104ca939e5"}, + {file = "zstandard-0.23.0-cp39-cp39-win32.whl", hash = "sha256:43da0f0092281bf501f9c5f6f3b4c975a8a0ea82de49ba3f7100e64d422a1274"}, + {file = "zstandard-0.23.0-cp39-cp39-win_amd64.whl", hash = "sha256:f8346bfa098532bc1fb6c7ef06783e969d87a99dd1d2a5a18a892c1d7a643c58"}, + {file = "zstandard-0.23.0.tar.gz", hash = "sha256:b2d8c62d08e7255f68f7a740bae85b3c9b8e5466baa9cbf7f57f1cde0ac6bc09"}, +] + +[package.dependencies] +cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""} + +[package.extras] +cffi = ["cffi (>=1.11)"] + [extras] scrapy = ["scrapy"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4cbc437e3764ddef7e5458c9c3bf56eb5618ff46447c47da69d334365ae4f0d3" +content-hash = 
"bbc1874d4d52e1ebb6ed1e3050a625757c6bf49875760cd00d9577a773d82bb5" diff --git a/pyproject.toml b/pyproject.toml index 295cf1ff..74777d84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ keywords = [ python = "^3.9" apify-client = ">=1.8.1" apify-shared = ">=1.2.1" -crawlee = "~0.4.0" +crawlee = "~0.5.0" cryptography = ">=42.0.0" httpx = ">=0.27.0" lazy-object-proxy = ">=1.10.0" diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 7c84e510..034a4eba 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -13,8 +13,9 @@ from apify_client import ApifyClientAsync from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value -from crawlee import service_container +from crawlee import service_locator from crawlee.events._types import Event, EventMigratingData, EventPersistStateData +from crawlee.storage_clients import MemoryStorageClient from apify._configuration import Configuration from apify._consts import EVENT_LISTENERS_TIMEOUT @@ -71,17 +72,22 @@ def __init__( self._configure_logging = configure_logging self._apify_client = self.new_client() - self._event_manager: EventManager - if self._configuration.is_at_home: - self._event_manager = PlatformEventManager( - config=self._configuration, - persist_state_interval=self._configuration.persist_state_interval, + # We need to keep both local & cloud storage clients because of the `force_cloud` option. + self._local_storage_client = MemoryStorageClient.from_config(config=self.config) + self._cloud_storage_client = ApifyStorageClient.from_config(config=self.config) + + # Set the event manager based on whether the Actor is running on the platform or locally. + self._event_manager = ( + PlatformEventManager( + config=self.config, + persist_state_interval=self.config.persist_state_interval, ) - else: - self._event_manager = LocalEventManager( - system_info_interval=self._configuration.system_info_interval, - persist_state_interval=self._configuration.persist_state_interval, + if self.is_at_home() + else LocalEventManager( + system_info_interval=self.config.system_info_interval, + persist_state_interval=self.config.persist_state_interval, ) + ) self._is_initialized = False @@ -94,9 +100,6 @@ async def __aenter__(self) -> Self: When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while executing the block code, the `Actor.fail` method is called. """ - if self._configure_logging: - _configure_logging(self._configuration) - await self.init() return self @@ -184,18 +187,21 @@ async def init(self) -> None: if self._is_initialized: raise RuntimeError('The Actor was already initialized!') - if self._configuration.token: - service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration)) + self._is_exiting = False + self._was_final_persist_state_emitted = False - if self._configuration.is_at_home: - service_container.set_default_storage_client_type('cloud') + # Register services in the service locator. 
+ if self.is_at_home(): + service_locator.set_storage_client(self._cloud_storage_client) else: - service_container.set_default_storage_client_type('local') + service_locator.set_storage_client(self._local_storage_client) - service_container.set_event_manager(self._event_manager) + service_locator.set_event_manager(self.event_manager) + service_locator.set_configuration(self.configuration) - self._is_exiting = False - self._was_final_persist_state_emitted = False + # The logging configuration has to be called after all service_locator set methods. + if self._configure_logging: + _configure_logging() self.log.info('Initializing Actor...') self.log.info('System info', extra=get_system_info()) @@ -245,7 +251,6 @@ async def finalize() -> None: await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout) await self._event_manager.__aexit__(None, None, None) - cast(dict, service_container._services).clear() # noqa: SLF001 await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) self._is_initialized = False @@ -349,11 +354,13 @@ async def open_dataset( self._raise_if_not_initialized() self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud) + storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client() + return await Dataset.open( id=id, name=name, configuration=self._configuration, - storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + storage_client=storage_client, ) async def open_key_value_store( @@ -381,12 +388,13 @@ async def open_key_value_store( """ self._raise_if_not_initialized() self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud) + storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client() return await KeyValueStore.open( id=id, name=name, configuration=self._configuration, - storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + storage_client=storage_client, ) async def open_request_queue( @@ -417,11 +425,13 @@ async def open_request_queue( self._raise_if_not_initialized() self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud) + storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client() + return await RequestQueue.open( id=id, name=name, configuration=self._configuration, - storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None), + storage_client=storage_client, ) async def push_data(self, data: dict | list[dict]) -> None: @@ -963,7 +973,7 @@ async def create_proxy_configuration( password: str | None = None, groups: list[str] | None = None, country_code: str | None = None, - proxy_urls: list[str] | None = None, + proxy_urls: list[str | None] | None = None, new_url_function: _NewUrlFunction | None = None, ) -> ProxyConfiguration | None: """Create a ProxyConfiguration object with the passed proxy configuration. 
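In the `_actor.py` hunks above, crawlee 0.5.0 drops `service_container`, so the Actor now registers its services with `service_locator` and keeps both a local and a cloud storage client so that `force_cloud` can still override the registered default. A minimal sketch of that flow, using only the calls that appear in the hunks; the standalone helper and its import paths are illustrative assumptions, not part of the SDK:

```python
from crawlee import service_locator
from crawlee.storage_clients import MemoryStorageClient
from crawlee.storages import Dataset

from apify import Configuration
from apify.apify_storage_client import ApifyStorageClient  # assumed public import path


async def open_dataset_like_actor(*, force_cloud: bool) -> Dataset:
    # Hypothetical helper mirroring what Actor.init() + Actor.open_dataset() do above.
    config = Configuration.get_global_configuration()

    # Both clients are created up front; only one becomes the registered default.
    local_client = MemoryStorageClient.from_config(config=config)
    cloud_client = ApifyStorageClient.from_config(config=config)
    service_locator.set_storage_client(cloud_client if config.is_at_home else local_client)

    # force_cloud bypasses whatever default is registered in the locator.
    storage_client = cloud_client if force_cloud else service_locator.get_storage_client()
    return await Dataset.open(configuration=config, storage_client=storage_client)
```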
diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index 018a6e98..af15ade7 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -1,6 +1,7 @@ from __future__ import annotations from datetime import datetime, timedelta +from logging import getLogger from typing import Annotated, Any from pydantic import AliasChoices, BeforeValidator, Field @@ -12,6 +13,8 @@ from apify._utils import docs_group +logger = getLogger(__name__) + def _transform_to_list(value: Any) -> list[str] | None: if value is None: @@ -353,6 +356,15 @@ class Configuration(CrawleeConfiguration): ), ] = None + @classmethod + def get_global_configuration(cls) -> Configuration: + """Retrieve the global instance of the configuration. + + Mostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()` + instead. + """ + return cls() + # Monkey-patch the base class so that it works with the extended configuration CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign] diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index e564706c..6fa64f56 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -111,9 +111,9 @@ def __init__( password: str | None = None, groups: list[str] | None = None, country_code: str | None = None, - proxy_urls: list[str] | None = None, + proxy_urls: list[str | None] | None = None, new_url_function: _NewUrlFunction | None = None, - tiered_proxy_urls: list[list[str]] | None = None, + tiered_proxy_urls: list[list[str | None]] | None = None, _actor_config: Configuration | None = None, _apify_client: ApifyClientAsync | None = None, ) -> None: @@ -148,7 +148,7 @@ def __init__( ' "groups" or "country_code".' ) - if proxy_urls and any('apify.com' in url for url in proxy_urls): + if proxy_urls and any('apify.com' in (url or '') for url in proxy_urls): logger.warning( 'Some Apify proxy features may work incorrectly. 
Please consider setting up Apify properties ' 'instead of `proxy_urls`.\n' diff --git a/src/apify/apify_storage_client/_apify_storage_client.py b/src/apify/apify_storage_client/_apify_storage_client.py index 45689c60..67c6a456 100644 --- a/src/apify/apify_storage_client/_apify_storage_client.py +++ b/src/apify/apify_storage_client/_apify_storage_client.py @@ -1,10 +1,13 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + from typing_extensions import override from apify_client import ApifyClientAsync from crawlee._utils.crypto import crypto_random_object_id -from crawlee.base_storage_client import BaseStorageClient +from crawlee.storage_clients import BaseStorageClient -from apify._configuration import Configuration from apify._utils import docs_group from apify.apify_storage_client._dataset_client import DatasetClient from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient @@ -13,6 +16,9 @@ from apify.apify_storage_client._request_queue_client import RequestQueueClient from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient +if TYPE_CHECKING: + from apify._configuration import Configuration + @docs_group('Classes') class ApifyStorageClient(BaseStorageClient): @@ -29,6 +35,10 @@ def __init__(self, *, configuration: Configuration) -> None: ) self._configuration = configuration + @classmethod + def from_config(cls, config: Configuration) -> ApifyStorageClient: + return cls(configuration=config) + @override def dataset(self, id: str) -> DatasetClient: return DatasetClient(self._apify_client.dataset(id)) diff --git a/src/apify/apify_storage_client/_dataset_client.py b/src/apify/apify_storage_client/_dataset_client.py index 5c5802d2..2a483d71 100644 --- a/src/apify/apify_storage_client/_dataset_client.py +++ b/src/apify/apify_storage_client/_dataset_client.py @@ -4,7 +4,8 @@ from typing_extensions import override -from crawlee.base_storage_client import BaseDatasetClient, DatasetItemsListPage, DatasetMetadata +from crawlee.storage_clients._base import BaseDatasetClient +from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata if TYPE_CHECKING: from collections.abc import AsyncIterator diff --git a/src/apify/apify_storage_client/_dataset_collection_client.py b/src/apify/apify_storage_client/_dataset_collection_client.py index 1a6fb27a..dc1f29c1 100644 --- a/src/apify/apify_storage_client/_dataset_collection_client.py +++ b/src/apify/apify_storage_client/_dataset_collection_client.py @@ -4,7 +4,8 @@ from typing_extensions import override -from crawlee.base_storage_client import BaseDatasetCollectionClient, DatasetListPage, DatasetMetadata +from crawlee.storage_clients._base import BaseDatasetCollectionClient +from crawlee.storage_clients.models import DatasetListPage, DatasetMetadata if TYPE_CHECKING: from apify_client.clients import DatasetCollectionClientAsync diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py index 48ab72cb..601f0b89 100644 --- a/src/apify/apify_storage_client/_key_value_store_client.py +++ b/src/apify/apify_storage_client/_key_value_store_client.py @@ -5,12 +5,8 @@ from typing_extensions import override -from crawlee.base_storage_client import ( - BaseKeyValueStoreClient, - KeyValueStoreListKeysPage, - KeyValueStoreMetadata, - KeyValueStoreRecord, -) +from crawlee.storage_clients._base import BaseKeyValueStoreClient +from crawlee.storage_clients.models import 
KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord if TYPE_CHECKING: from collections.abc import AsyncIterator diff --git a/src/apify/apify_storage_client/_key_value_store_collection_client.py b/src/apify/apify_storage_client/_key_value_store_collection_client.py index cf22821e..42bdd96c 100644 --- a/src/apify/apify_storage_client/_key_value_store_collection_client.py +++ b/src/apify/apify_storage_client/_key_value_store_collection_client.py @@ -4,7 +4,8 @@ from typing_extensions import override -from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient, KeyValueStoreListPage, KeyValueStoreMetadata +from crawlee.storage_clients._base import BaseKeyValueStoreCollectionClient +from crawlee.storage_clients.models import KeyValueStoreListPage, KeyValueStoreMetadata if TYPE_CHECKING: from apify_client.clients import KeyValueStoreCollectionClientAsync diff --git a/src/apify/apify_storage_client/_request_queue_client.py b/src/apify/apify_storage_client/_request_queue_client.py index 2cdbe58d..e86809b4 100644 --- a/src/apify/apify_storage_client/_request_queue_client.py +++ b/src/apify/apify_storage_client/_request_queue_client.py @@ -5,8 +5,8 @@ from typing_extensions import override from crawlee import Request -from crawlee.base_storage_client import ( - BaseRequestQueueClient, +from crawlee.storage_clients._base import BaseRequestQueueClient +from crawlee.storage_clients.models import ( BatchRequestsOperationResponse, ProcessedRequest, ProlongRequestLockResponse, diff --git a/src/apify/apify_storage_client/_request_queue_collection_client.py b/src/apify/apify_storage_client/_request_queue_collection_client.py index 50aad1aa..ec9e5eb0 100644 --- a/src/apify/apify_storage_client/_request_queue_collection_client.py +++ b/src/apify/apify_storage_client/_request_queue_collection_client.py @@ -4,7 +4,8 @@ from typing_extensions import override -from crawlee.base_storage_client import BaseRequestQueueCollectionClient, RequestQueueListPage, RequestQueueMetadata +from crawlee.storage_clients._base import BaseRequestQueueCollectionClient +from crawlee.storage_clients.models import RequestQueueListPage, RequestQueueMetadata if TYPE_CHECKING: from apify_client.clients import RequestQueueCollectionClientAsync diff --git a/src/apify/log.py b/src/apify/log.py index 698474f6..970a37a6 100644 --- a/src/apify/log.py +++ b/src/apify/log.py @@ -1,14 +1,10 @@ from __future__ import annotations import logging -from typing import TYPE_CHECKING from apify_shared.utils import ignore_docs from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level -if TYPE_CHECKING: - from apify import Configuration - # Name of the logger used throughout the library (resolves to 'apify') logger_name = __name__.split('.')[0] @@ -21,11 +17,11 @@ class ActorLogFormatter(CrawleeLogFormatter): # noqa: D101 (Inherited from pare pass -def _configure_logging(configuration: Configuration) -> None: +def _configure_logging() -> None: apify_client_logger = logging.getLogger('apify_client') - configure_logger(apify_client_logger, configuration, remove_old_handlers=True) + configure_logger(apify_client_logger, remove_old_handlers=True) - level = get_configured_log_level(configuration) + level = get_configured_log_level() # Keep apify_client logger quiet unless debug logging is requested if level > logging.DEBUG: @@ -42,4 +38,4 @@ def _configure_logging(configuration: Configuration) -> None: # Use configured log level for apify logger apify_logger = logging.getLogger('apify') - 
configure_logger(apify_logger, configuration, remove_old_handlers=True) + configure_logger(apify_logger, remove_old_handlers=True) diff --git a/src/apify/storages/_request_list.py b/src/apify/storages/_request_list.py index 2dd381fa..7cc2828b 100644 --- a/src/apify/storages/_request_list.py +++ b/src/apify/storages/_request_list.py @@ -11,7 +11,7 @@ from crawlee import Request from crawlee._types import HttpMethod from crawlee.http_clients import BaseHttpClient, HttpxHttpClient -from crawlee.storages import RequestList as CrawleeRequestList +from crawlee.request_loaders import RequestList as CrawleeRequestList from apify._utils import docs_group diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 060644bd..9a74924a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -7,13 +7,15 @@ import sys import textwrap from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Protocol, cast +from typing import TYPE_CHECKING, Any, Callable, Protocol import pytest from filelock import FileLock from apify_client import ApifyClientAsync -from apify_shared.consts import ActorJobStatus, ActorSourceType +from apify_shared.consts import ActorJobStatus, ActorSourceType, ApifyEnvVars +from crawlee import service_locator +from crawlee.storages import _creation_management import apify._actor from ._utils import generate_unique_resource_name @@ -29,19 +31,67 @@ _SDK_ROOT_PATH = Path(__file__).parent.parent.parent.resolve() -@pytest.fixture(autouse=True) -def _reset_and_patch_default_instances() -> None: - """Reset the used singletons and patch the default storage client with a temporary directory. +@pytest.fixture +def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]: + """Prepare the testing environment by resetting the global state before each test. + + This fixture ensures that the global state of the package is reset to a known baseline before each test runs. + It also configures a temporary storage directory for test isolation. + + Args: + monkeypatch: Test utility provided by pytest for patching. + tmp_path: A unique temporary directory path provided by pytest for test isolation. - To isolate the tests, we need to reset the used singletons before each test case. We also patch the default - storage client with a tmp_path. + Returns: + A callable that prepares the test environment. """ - from crawlee import service_container - cast(dict, service_container._services).clear() - delattr(apify._actor.Actor, '__wrapped__') + def _prepare_test_env() -> None: + delattr(apify._actor.Actor, '__wrapped__') + + # Set the environment variable for the local storage directory to the temporary path. + monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) + + # Reset the flags in the service locator to indicate that no services are explicitly set. This ensures + # a clean state, as services might have been set during a previous test and not reset properly. + service_locator._configuration_was_retrieved = False + service_locator._storage_client_was_retrieved = False + service_locator._event_manager_was_retrieved = False + + # Reset the services in the service locator. + service_locator._configuration = None + service_locator._event_manager = None + service_locator._storage_client = None + + # Clear creation-related caches to ensure no state is carried over between tests. 
+ monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {}) + monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {}) + monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {}) + monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {}) + monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {}) + monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {}) + + # Verify that the test environment was set up correctly. + assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path) + assert service_locator._configuration_was_retrieved is False + assert service_locator._storage_client_was_retrieved is False + assert service_locator._event_manager_was_retrieved is False + + return _prepare_test_env + + +@pytest.fixture(autouse=True) +def _isolate_test_environment(prepare_test_env: Callable[[], None]) -> None: + """Isolate the testing environment by resetting global state before and after each test. + + This fixture ensures that each test starts with a clean slate and that any modifications during the test + do not affect subsequent tests. It runs automatically for all tests. + + Args: + prepare_test_env: Fixture to prepare the environment before each test. + """ - # TODO: StorageClientManager local storage client purge # noqa: TD003 + prepare_test_env() @pytest.fixture diff --git a/tests/unit/actor/test_actor_dataset.py b/tests/unit/actor/test_actor_dataset.py index d666cc20..ef6282bb 100644 --- a/tests/unit/actor/test_actor_dataset.py +++ b/tests/unit/actor/test_actor_dataset.py @@ -9,7 +9,7 @@ from apify import Actor if TYPE_CHECKING: - from crawlee.memory_storage_client import MemoryStorageClient + from crawlee.storage_clients import MemoryStorageClient # NOTE: We only test the dataset methods available on Actor class/instance. # Actual tests for the implementations are in storages/. diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 42d6b2d4..821065e1 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -13,7 +13,7 @@ from apify._crypto import public_encrypt if TYPE_CHECKING: - from crawlee.memory_storage_client import MemoryStorageClient + from crawlee.storage_clients import MemoryStorageClient # NOTE: We only test the key-value store methods available on Actor class/instance. 
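The rewritten `tests/integration/conftest.py` above replaces the old singleton-clearing fixture with `prepare_test_env`, which resets the `service_locator` flags and services, clears the storage creation caches, and points `ApifyEnvVars.LOCAL_STORAGE_DIR` at a per-test `tmp_path`. A sketch of how a test can call it between two Actor runs in one test function, mirroring the lifecycle test that follows; the test body itself is hypothetical:

```python
from typing import Callable

from apify import Actor


async def test_reuses_actor_after_reset(prepare_test_env: Callable[[], None]) -> None:
    # First run: Actor.init() registers services in service_locator.
    async with Actor:
        await Actor.push_data({'run': 1})

    # Reset service_locator state and storage caches before a second init in the same test.
    prepare_test_env()

    async with Actor:
        await Actor.push_data({'run': 2})
```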
diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index 607e1682..33af45e6 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -35,18 +35,25 @@ async def test_actor_init() -> None: assert my_actor._is_initialized is False -async def test_double_init_raises_error() -> None: - my_actor = _ActorType() +async def test_double_init_raises_error(prepare_test_env: Callable) -> None: + async with Actor: + assert Actor._is_initialized + with pytest.raises(RuntimeError): + await Actor.init() - await my_actor.init() - with pytest.raises(RuntimeError): - await my_actor.init() - await my_actor.exit() + prepare_test_env() - await Actor.init() - with pytest.raises(RuntimeError): - await Actor.init() - await Actor.exit() + async with Actor() as actor: + assert actor._is_initialized + with pytest.raises(RuntimeError): + await actor.init() + + prepare_test_env() + + async with _ActorType() as actor: + assert actor._is_initialized + with pytest.raises(RuntimeError): + await actor.init() async def test_actor_exits_cleanly_with_events(monkeypatch: pytest.MonkeyPatch) -> None: diff --git a/tests/unit/actor/test_actor_non_default_instance.py b/tests/unit/actor/test_actor_non_default_instance.py index 68e380aa..6a51be23 100644 --- a/tests/unit/actor/test_actor_non_default_instance.py +++ b/tests/unit/actor/test_actor_non_default_instance.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from datetime import timedelta from apify import Actor, Configuration diff --git a/tests/unit/actor/test_request_list.py b/tests/unit/actor/test_request_list.py index 4f4c75ac..9efcdce7 100644 --- a/tests/unit/actor/test_request_list.py +++ b/tests/unit/actor/test_request_list.py @@ -14,20 +14,34 @@ from apify.storages._request_list import URL_NO_COMMAS_REGEX, RequestList -@pytest.mark.parametrize('request_method', get_args(HttpMethod)) @pytest.mark.parametrize( - 'optional_input', - [ - {}, - { - 'payload': 'some payload', - 'userData': {'some key': 'some value'}, - 'headers': {'h1': 'v1', 'h2': 'v2'}, - }, + argnames='request_method', + argvalues=[ + pytest.param( + method, + id=str(method), + ) + for method in get_args(HttpMethod) ], - ids=['minimal', 'all_options'], ) -async def test_request_list_open_request_types(request_method: HttpMethod, optional_input: dict[str, Any]) -> None: +@pytest.mark.parametrize( + argnames='optional_input', + argvalues=[ + pytest.param({}, id='minimal'), + pytest.param( + { + 'payload': 'some payload', + 'userData': {'some key': 'some value'}, + 'headers': {'h1': 'v1', 'h2': 'v2'}, + }, + id='all_options', + ), + ], +) +async def test_request_list_open_request_types( + request_method: HttpMethod, + optional_input: dict[str, Any], +) -> None: """Test proper request list generation from both minimal and full inputs for all method types for simple input.""" minimal_request_dict_input = { 'url': 'https://www.abc.com', @@ -37,9 +51,10 @@ async def test_request_list_open_request_types(request_method: HttpMethod, optio request_list = await RequestList.open(request_list_sources_input=[request_dict_input]) assert not await request_list.is_empty() + request = await request_list.fetch_next_request() assert request is not None - assert await request_list.is_empty() + assert await request_list.is_empty(), 'Request list should be empty after fetching all requests' assert request.method == request_dict_input['method'] assert request.url == request_dict_input['url'] @@ -164,20 +179,21 @@ async def 
test_request_list_open_name() -> None: @pytest.mark.parametrize( - 'true_positive', - [ - 'http://www.something.com', - 'https://www.something.net', - 'http://nowww.cz', - 'https://with-hypen.com', - 'http://number1.com', - 'http://www.number.123.abc', - 'http://many.dots.com', - 'http://a.com', - 'http://www.something.com/somethignelse' 'http://www.something.com/somethignelse.txt', - 'http://non-english-chars-áíéåü.com', - 'http://www.port.com:1234', - 'http://username:password@something.else.com', + argnames='true_positive', + argvalues=[ + pytest.param('http://www.something.com', id='standard_http_with_www'), + pytest.param('https://www.something.net', id='standard_https_with_www'), + pytest.param('http://nowww.cz', id='http_no_www'), + pytest.param('https://with-hypen.com', id='https_with_hyphen'), + pytest.param('http://number1.com', id='http_with_number_in_domain'), + pytest.param('http://www.number.123.abc', id='http_with_subdomains_and_numbers'), + pytest.param('http://many.dots.com', id='http_with_multiple_subdomains'), + pytest.param('http://a.com', id='http_short_domain'), + pytest.param('http://www.something.com/somethignelse', id='http_with_path_no_extension'), + pytest.param('http://www.something.com/somethignelse.txt', id='http_with_path_and_extension'), + pytest.param('http://non-english-chars-áíéåü.com', id='http_with_non_english_chars'), + pytest.param('http://www.port.com:1234', id='http_with_port'), + pytest.param('http://username:password@something.else.com', id='http_with_authentication'), ], ) def test_url_no_commas_regex_true_positives(true_positive: str) -> None: @@ -188,14 +204,14 @@ def test_url_no_commas_regex_true_positives(true_positive: str) -> None: @pytest.mark.parametrize( - 'false_positive', - [ - 'http://www.a', - 'http://a', - 'http://a.a', - 'http://123.456', - 'www.something.com', - 'http:www.something.com', + argnames='false_positive', + argvalues=[ + pytest.param('http://www.a', id='invalid_domain_single_level'), + pytest.param('http://a', id='invalid_domain_no_tld'), + pytest.param('http://a.a', id='invalid_domain_short_tld'), + pytest.param('http://123.456', id='invalid_numeric_domain'), + pytest.param('www.something.com', id='missing_protocol'), + pytest.param('http:www.something.com', id='missing_slashes'), ], ) def test_url_no_commas_regex_false_positives(false_positive: str) -> None: diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 1d7b7660..9292cf54 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -2,16 +2,19 @@ import asyncio import inspect +import os from collections import defaultdict from copy import deepcopy -from typing import TYPE_CHECKING, Any, Callable, cast, get_type_hints +from typing import TYPE_CHECKING, Any, Callable, get_type_hints import pytest -from apify_client.client import ApifyClientAsync +from apify_client import ApifyClientAsync from apify_shared.consts import ApifyEnvVars +from crawlee import service_locator from crawlee.configuration import Configuration as CrawleeConfiguration -from crawlee.memory_storage_client import MemoryStorageClient +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import _creation_management import apify._actor @@ -20,45 +23,66 @@ @pytest.fixture -def reset_default_instances() -> Callable[[], None]: - def reset() -> None: - from crawlee.storages._creation_management import ( - _cache_dataset_by_id, - _cache_dataset_by_name, - _cache_kvs_by_id, - _cache_kvs_by_name, - _cache_rq_by_id, - _cache_rq_by_name, - ) - - 
_cache_dataset_by_id.clear() - _cache_dataset_by_name.clear() - _cache_kvs_by_id.clear() - _cache_kvs_by_name.clear() - _cache_rq_by_id.clear() - _cache_rq_by_name.clear() - - from crawlee import service_container - - cast(dict, service_container._services).clear() +def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]: + """Prepare the testing environment by resetting the global state before each test. + + This fixture ensures that the global state of the package is reset to a known baseline before each test runs. + It also configures a temporary storage directory for test isolation. + + Args: + monkeypatch: Test utility provided by pytest for patching. + tmp_path: A unique temporary directory path provided by pytest for test isolation. + + Returns: + A callable that prepares the test environment. + """ + + def _prepare_test_env() -> None: delattr(apify._actor.Actor, '__wrapped__') - # TODO: local storage client purge # noqa: TD003 - return reset + # Set the environment variable for the local storage directory to the temporary path. + monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) + + # Reset the flags in the service locator to indicate that no services are explicitly set. This ensures + # a clean state, as services might have been set during a previous test and not reset properly. + service_locator._configuration_was_retrieved = False + service_locator._storage_client_was_retrieved = False + service_locator._event_manager_was_retrieved = False + + # Reset the services in the service locator. + service_locator._configuration = None + service_locator._event_manager = None + service_locator._storage_client = None + + # Clear creation-related caches to ensure no state is carried over between tests. + monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {}) + monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {}) + monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {}) + monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {}) + monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {}) + monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {}) + + # Verify that the test environment was set up correctly. + assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path) + assert service_locator._configuration_was_retrieved is False + assert service_locator._storage_client_was_retrieved is False + assert service_locator._event_manager_was_retrieved is False + + return _prepare_test_env -# To isolate the tests, we need to reset the used singletons before each test case -# We also set the MemoryStorageClient to use a temp path @pytest.fixture(autouse=True) -def _reset_and_patch_default_instances( - monkeypatch: pytest.MonkeyPatch, - tmp_path: Path, - reset_default_instances: Callable[[], None], -) -> None: - # This forces the MemoryStorageClient to use tmp_path for its storage dir - monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) +def _isolate_test_environment(prepare_test_env: Callable[[], None]) -> None: + """Isolate the testing environment by resetting global state before and after each test. + + This fixture ensures that each test starts with a clean slate and that any modifications during the test + do not affect subsequent tests. It runs automatically for all tests. + + Args: + prepare_test_env: Fixture to prepare the environment before each test. 
+ """ - reset_default_instances() + prepare_test_env() # This class is used to patch the ApifyClientAsync methods to return a fixed value or be replaced with another method. @@ -179,4 +203,4 @@ def memory_storage_client() -> MemoryStorageClient: configuration.persist_storage = True configuration.write_metadata = True - return MemoryStorageClient(configuration) + return MemoryStorageClient.from_config(configuration) diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index fa2fd53b..57450897 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -153,7 +153,7 @@ async def test_new_url_with_session_ids() -> None: async def test_rotating_custom_urls() -> None: - proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] + proxy_urls: list[str | None] = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) assert await proxy_configuration.new_url() == proxy_urls[0] @@ -166,7 +166,7 @@ async def test_rotating_custom_urls() -> None: async def test_rotating_custom_urls_with_sessions() -> None: sessions = ['sesssion_01', 'sesssion_02', 'sesssion_03', 'sesssion_04', 'sesssion_05', 'sesssion_06'] - proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] + proxy_urls: list[str | None] = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) @@ -239,18 +239,14 @@ def custom_new_url_function(session_id: str | None = None, request: Any = None) async def test_url_reference_not_shared_between_instances() -> None: - urls = [ + proxy_urls: list[str | None] = [ 'http://proxy-example-1.com:8000', 'http://proxy-example-2.com:8000', ] - proxy_configuration_1 = ProxyConfiguration( - proxy_urls=urls, - ) + proxy_configuration_1 = ProxyConfiguration(proxy_urls=proxy_urls) - urls.append('http://proxy-example-3.com:8000') - proxy_configuration_2 = ProxyConfiguration( - proxy_urls=urls, - ) + proxy_urls.append('http://proxy-example-3.com:8000') + proxy_configuration_2 = ProxyConfiguration(proxy_urls=proxy_urls) assert proxy_configuration_1 is not None assert proxy_configuration_2 is not None @@ -296,7 +292,7 @@ async def test_new_proxy_info_basic_construction() -> None: async def test_new_proxy_info_rotating_urls() -> None: - proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] + proxy_urls: list[str | None] = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) proxy_info = await proxy_configuration.new_proxy_info() @@ -326,7 +322,7 @@ async def test_new_proxy_info_rotating_urls() -> None: async def test_new_proxy_info_rotating_urls_with_sessions() -> None: sessions = ['sesssion_01', 'sesssion_02', 'sesssion_03', 'sesssion_04', 'sesssion_05', 'sesssion_06'] - proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] + proxy_urls: list[str | None] = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls)