Skip to content

Commit 62b7c70

Browse files
authored
fix: save context state in result for AdaptivePlaywrightCrawler after isolated processing in SubCrawler (#1488)
### Description

- Save context state in result for `AdaptivePlaywrightCrawler` after isolated processing in `SubCrawler`

### Issues

- Closes: #1483
1 parent 36446a7 commit 62b7c70

File tree

2 files changed

+111
-7
lines changed

2 files changed

+111
-7
lines changed

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ async def get_input_state(
315315
),
316316
logger=self._logger,
317317
)
318-
return SubCrawlerRun(result=result)
318+
return SubCrawlerRun(result=result, run_context=context_linked_to_result)
319319
except Exception as e:
320320
return SubCrawlerRun(exception=e)
321321

@@ -371,7 +371,8 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
371371
self.track_http_only_request_handler_runs()
372372

373373
static_run = await self._crawl_one(rendering_type='static', context=context)
374-
if static_run.result and self.result_checker(static_run.result):
374+
if static_run.result and static_run.run_context and self.result_checker(static_run.result):
375+
self._update_context_from_copy(context, static_run.run_context)
375376
self._context_result_map[context] = static_run.result
376377
return
377378
if static_run.exception:
@@ -402,13 +403,10 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
402403
if pw_run.exception is not None:
403404
raise pw_run.exception
404405

405-
if pw_run.result:
406-
self._context_result_map[context] = pw_run.result
407-
406+
if pw_run.result and pw_run.run_context:
408407
if should_detect_rendering_type:
409408
detection_result: RenderingType
410409
static_run = await self._crawl_one('static', context=context, state=old_state_copy)
411-
412410
if static_run.result and self.result_comparator(static_run.result, pw_run.result):
413411
detection_result = 'static'
414412
else:
@@ -417,6 +415,9 @@ async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
417415
context.log.debug(f'Detected rendering type {detection_result} for {context.request.url}')
418416
self.rendering_type_predictor.store_result(context.request, detection_result)
419417

418+
self._update_context_from_copy(context, pw_run.run_context)
419+
self._context_result_map[context] = pw_run.result
420+
420421
def pre_navigation_hook(
421422
self,
422423
hook: Callable[[AdaptivePlaywrightPreNavCrawlingContext], Awaitable[None]] | None = None,
@@ -451,8 +452,32 @@ def track_browser_request_handler_runs(self) -> None:
451452
def track_rendering_type_mispredictions(self) -> None:
452453
self.statistics.state.rendering_type_mispredictions += 1
453454

455+
def _update_context_from_copy(self, context: BasicCrawlingContext, context_copy: BasicCrawlingContext) -> None:
456+
"""Update mutable fields of `context` from `context_copy`.
457+
458+
Uses object.__setattr__ to bypass frozen dataclass restrictions,
459+
allowing state synchronization after isolated crawler execution.
460+
"""
461+
updating_attributes = {
462+
'request': ('headers', 'user_data'),
463+
'session': ('_user_data', '_usage_count', '_error_score', '_cookies'),
464+
}
465+
466+
for attr, sub_attrs in updating_attributes.items():
467+
original_sub_obj = getattr(context, attr)
468+
copy_sub_obj = getattr(context_copy, attr)
469+
470+
# Check that both sub objects are not None
471+
if original_sub_obj is None or copy_sub_obj is None:
472+
continue
473+
474+
for sub_attr in sub_attrs:
475+
new_value = getattr(copy_sub_obj, sub_attr)
476+
object.__setattr__(original_sub_obj, sub_attr, new_value)
477+
454478

455479
@dataclass(frozen=True)
456480
class SubCrawlerRun:
457481
result: RequestHandlerRunResult | None = None
458482
exception: Exception | None = None
483+
run_context: BasicCrawlingContext | None = None

tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@
2929
from crawlee.crawlers._adaptive_playwright._adaptive_playwright_crawling_context import (
3030
AdaptiveContextError,
3131
)
32+
from crawlee.sessions import SessionPool
3233
from crawlee.statistics import Statistics
3334
from crawlee.storage_clients import SqlStorageClient
34-
from crawlee.storages import KeyValueStore
35+
from crawlee.storages import KeyValueStore, RequestQueue
3536

3637
if TYPE_CHECKING:
3738
from collections.abc import AsyncGenerator, Iterator
@@ -730,6 +731,84 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
730731
mocked_h3_handler.assert_called_once_with(None)
731732

732733

734+
@pytest.mark.parametrize(
735+
'test_input',
736+
[
737+
pytest.param(
738+
TestInput(
739+
expected_pw_count=0,
740+
expected_static_count=2,
741+
rendering_types=cycle(['static']),
742+
detection_probability_recommendation=cycle([0]),
743+
),
744+
id='Static only',
745+
),
746+
pytest.param(
747+
TestInput(
748+
expected_pw_count=2,
749+
expected_static_count=0,
750+
rendering_types=cycle(['client only']),
751+
detection_probability_recommendation=cycle([0]),
752+
),
753+
id='Client only',
754+
),
755+
pytest.param(
756+
TestInput(
757+
expected_pw_count=2,
758+
expected_static_count=2,
759+
rendering_types=cycle(['static', 'client only']),
760+
detection_probability_recommendation=cycle([1]),
761+
),
762+
id='Enforced rendering type detection',
763+
),
764+
],
765+
)
766+
async def test_change_context_state_after_handling(test_input: TestInput, server_url: URL) -> None:
767+
"""Test that context state is saved after handling the request."""
768+
predictor = _SimpleRenderingTypePredictor(
769+
rendering_types=test_input.rendering_types,
770+
detection_probability_recommendation=test_input.detection_probability_recommendation,
771+
)
772+
773+
request_queue = await RequestQueue.open(name='state-test')
774+
used_session_id = None
775+
776+
async with SessionPool() as session_pool:
777+
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
778+
rendering_type_predictor=predictor,
779+
session_pool=session_pool,
780+
request_manager=request_queue,
781+
)
782+
783+
@crawler.router.default_handler
784+
async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
785+
nonlocal used_session_id
786+
787+
if context.session is not None:
788+
used_session_id = context.session.id
789+
context.session.user_data['session_state'] = True
790+
791+
if isinstance(context.request.user_data['request_state'], list):
792+
context.request.user_data['request_state'].append('handler')
793+
794+
request = Request.from_url(str(server_url), user_data={'request_state': ['initial']})
795+
796+
await crawler.run([request])
797+
798+
assert used_session_id is not None
799+
800+
session = await session_pool.get_session_by_id(used_session_id)
801+
check_request = await request_queue.get_request(request.unique_key)
802+
803+
assert session is not None
804+
assert check_request is not None
805+
assert session.user_data.get('session_state') is True
806+
# Check that the request user data was updated in the handler, and only once.
807+
assert check_request.user_data.get('request_state') == ['initial', 'handler']
808+
809+
await request_queue.drop()
810+
811+
733812
async def test_adaptive_playwright_crawler_with_sql_storage(test_urls: list[str], tmp_path: Path) -> None:
734813
"""Tests that AdaptivePlaywrightCrawler can be initialized with SqlStorageClient."""
735814
storage_dir = tmp_path / 'test_table.db'

0 commit comments

Comments
 (0)