From ec310ca7ee6f2b4a7091a75d7f46117aeb374a9e Mon Sep 17 00:00:00 2001 From: otsch Date: Thu, 17 Oct 2024 23:56:02 +0200 Subject: [PATCH 1/3] Post Browser Navigate Hooks Adds post browser navigate hooks that are passed from `Http` steps to the browser loader helper and are executed after the headless browser has navigated to the specified URL. They are called with the chrome-php `Page` object as an argument, so you can run things on that page before getting the HTML and returning the response. --- CHANGELOG.md | 4 ++ .../Http/HeadlessBrowserLoaderHelper.php | 34 ++++++++++++++++ src/Steps/Loading/HttpBase.php | 23 ++++++++++- .../Http/HeadlessBrowserLoaderHelperTest.php | 40 +++++++++++++++++++ .../_Integration/Http/HeadlessBrowserTest.php | 28 +++++++++++++ tests/_Integration/Server.php | 4 ++ tests/_Integration/_Server/SetCookieJs.php | 1 + .../_Server/SetDelayedCookieJs.php | 10 +++++ 8 files changed, 142 insertions(+), 2 deletions(-) create mode 100644 tests/_Integration/_Server/SetDelayedCookieJs.php diff --git a/CHANGELOG.md b/CHANGELOG.md index e74bf49..213b89b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.1.0] - 2024-10-17 +### Added +* The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows to define callback functions that are triggered after the headless browser navigated to the specified URL. They are called with the chrome-php `Page` object as argument. + ## [2.0.1] - 2024-10-15 ### Fixed * Issue with the `afterLoad` hook of the `HttpLoader`, introduced in v2. Calling the hook was commented out, which slipped through because the test case was faulty. 
diff --git a/src/Loader/Http/HeadlessBrowserLoaderHelper.php b/src/Loader/Http/HeadlessBrowserLoaderHelper.php index 0a172d4..04db8ab 100644 --- a/src/Loader/Http/HeadlessBrowserLoaderHelper.php +++ b/src/Loader/Http/HeadlessBrowserLoaderHelper.php @@ -2,6 +2,7 @@ namespace Crwlr\Crawler\Loader\Http; +use Closure; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; use Crwlr\Crawler\Loader\Http\Politeness\Throttler; use Exception; @@ -42,8 +43,28 @@ class HeadlessBrowserLoaderHelper protected int $timeout = 30_000; + /** + * @var Closure[] + */ + protected array $tempPostNavigateHooks = []; + public function __construct(private ?BrowserFactory $browserFactory = null) {} + /** + * Set temporary post navigate hooks + * + * They will be executed after the next call to navigateToPageAndGetRespondedRequest() + * and forgotten afterward. + * + * @param Closure[] $hooks + */ + public function setTempPostNavigateHooks(array $hooks): static + { + $this->tempPostNavigateHooks = $hooks; + + return $this; + } + /** * @throws OperationTimedOut * @throws CommunicationException @@ -81,6 +102,8 @@ function ($params) use (&$statusCode, &$responseHeaders) { $this->navigate($request->getUri()->__toString()); + $this->callPostNavigateHooks(); + $throttler->trackRequestEndFor($request->getUri()); $html = $this->page?->getHtml(); @@ -196,6 +219,17 @@ protected function navigate(string $url): void } } + protected function callPostNavigateHooks(): void + { + if (!empty($this->tempPostNavigateHooks)) { + foreach ($this->tempPostNavigateHooks as $hook) { + $hook->call($this, $this->page); + } + } + + $this->tempPostNavigateHooks = []; + } + /** * @throws Exception */ diff --git a/src/Steps/Loading/HttpBase.php b/src/Steps/Loading/HttpBase.php index 763c431..cd9355a 100644 --- a/src/Steps/Loading/HttpBase.php +++ b/src/Steps/Loading/HttpBase.php @@ -2,6 +2,7 @@ namespace Crwlr\Crawler\Steps\Loading; +use Closure; use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException; use 
Crwlr\Crawler\Loader\Http\HttpLoader; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; @@ -42,6 +43,11 @@ abstract class HttpBase extends Step */ protected ?array $inputHeaders = null; + /** + * @var Closure[] + */ + protected array $postBrowserNavigateHooks = []; + /** * @param string $method * @param array $headers @@ -127,6 +133,13 @@ public function useInputKeyAsHeaders(string $key): static return $this; } + public function postBrowserNavigateHook(Closure $callback): static + { + $this->postBrowserNavigateHooks[] = $callback; + + return $this; + } + /** * @return UriInterface|UriInterface[] * @throws InvalidArgumentException @@ -185,10 +198,16 @@ protected function getRequestFromInputUri(UriInterface $uri): RequestInterface */ protected function getResponseFromRequest(RequestInterface $request): ?RespondedRequest { + $loader = $this->getLoader(); + + if (!empty($this->postBrowserNavigateHooks) && $loader->usesHeadlessBrowser()) { + $loader->browser()->setTempPostNavigateHooks($this->postBrowserNavigateHooks); + } + if ($this->stopOnErrorResponse) { - $response = $this->getLoader()->loadOrFail($request); + $response = $loader->loadOrFail($request); } else { - $response = $this->getLoader()->load($request); + $response = $loader->load($request); } if ($response !== null && ($response->response->getStatusCode() < 400 || $this->yieldErrorResponses)) { diff --git a/tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php b/tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php index d626ed1..a192dba 100644 --- a/tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php +++ b/tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php @@ -133,3 +133,43 @@ function helper_setUpHeadlessChromeMocks( expect($invadedBrowserFactory->chromeBinary)->toBe($chromeExecutable); }); + +it('calls the temporary post navigate hooks once', function () { + $browserFactoryMock = helper_setUpHeadlessChromeMocks(); + + $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock); + + 
$hook1Called = $hook2Called = $hook3Called = false; + + $helper->setTempPostNavigateHooks([ + function (Page $page) use (& $hook1Called) { + $hook1Called = true; + }, + function (Page $page) use (& $hook2Called) { + $hook2Called = true; + }, + function (Page $page) use (& $hook3Called) { + $hook3Called = true; + }, + ]); + + $helper->navigateToPageAndGetRespondedRequest( + new Request('GET', 'https://www.example.com/foo'), + helper_getMinThrottler(), + ); + + expect($hook1Called)->toBeTrue() + ->and($hook2Called)->toBeTrue() + ->and($hook3Called)->toBeTrue(); + + $hook1Called = $hook2Called = $hook3Called = false; + + $helper->navigateToPageAndGetRespondedRequest( + new Request('GET', 'https://www.example.com/foo'), + helper_getMinThrottler(), + ); + + expect($hook1Called)->toBeFalse() + ->and($hook2Called)->toBeFalse() + ->and($hook3Called)->toBeFalse(); +}); diff --git a/tests/_Integration/Http/HeadlessBrowserTest.php b/tests/_Integration/Http/HeadlessBrowserTest.php index a3b1769..927331c 100644 --- a/tests/_Integration/Http/HeadlessBrowserTest.php +++ b/tests/_Integration/Http/HeadlessBrowserTest.php @@ -10,6 +10,7 @@ use Crwlr\Crawler\UserAgents\UserAgent; use Crwlr\Crawler\UserAgents\UserAgentInterface; use Generator; +use HeadlessChromium\Page; use Psr\Log\LoggerInterface; use Symfony\Component\DomCrawler\Crawler; @@ -131,3 +132,30 @@ protected function invoke(mixed $input): Generator ->and($results[0]->get('printed-cookie'))->toBeString() ->and($results[0]->get('printed-cookie'))->toBe('javascriptcookie'); }); + +it('gets a cookie that is set via a click, executed via after navigate hook', function () { + $crawler = new HeadlessBrowserCrawler(); + + $crawler + ->input('http://localhost:8000/set-delayed-js-cookie') + ->addStep( + Http::get() + ->postBrowserNavigateHook(function (Page $page) { + $page->mouse()->find('#setCookieButton')->click(); + }), + ) + ->addStep(new class extends Step { + protected function invoke(mixed $input): Generator + { + yield 
'http://localhost:8000/print-cookie'; + } + }) + ->addStep(Http::get()) + ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie')); + + $results = helper_generatorToArray($crawler->run()); + + expect($results)->toHaveCount(1) + ->and($results[0]->get('printed-cookie'))->toBeString() + ->and($results[0]->get('printed-cookie'))->toBe('jscookie'); +}); diff --git a/tests/_Integration/Server.php b/tests/_Integration/Server.php index 9105e44..e17e49b 100644 --- a/tests/_Integration/Server.php +++ b/tests/_Integration/Server.php @@ -57,6 +57,10 @@ function getParamAfter(string $route, string $after): string return include(__DIR__ . '/_Server/SetCookieJs.php'); } +if ($route === '/set-delayed-js-cookie') { + return include(__DIR__ . '/_Server/SetDelayedCookieJs.php'); +} + if ($route === '/print-cookie') { return include(__DIR__ . '/_Server/PrintCookie.php'); } diff --git a/tests/_Integration/_Server/SetCookieJs.php b/tests/_Integration/_Server/SetCookieJs.php index 650a958..8b5d611 100644 --- a/tests/_Integration/_Server/SetCookieJs.php +++ b/tests/_Integration/_Server/SetCookieJs.php @@ -1,3 +1,4 @@ + yo diff --git a/tests/_Integration/_Server/SetDelayedCookieJs.php b/tests/_Integration/_Server/SetDelayedCookieJs.php new file mode 100644 index 0000000..8a33973 --- /dev/null +++ b/tests/_Integration/_Server/SetDelayedCookieJs.php @@ -0,0 +1,10 @@ + + + +Hey + +
+ +
+ + From 8251c93a00c84f7f3b7f28c63774ce122fc031c4 Mon Sep 17 00:00:00 2001 From: otsch Date: Fri, 18 Oct 2024 12:52:14 +0200 Subject: [PATCH 2/3] Add BrowserAction class It provides a lot of simple actions to run in the post browser navigate hooks as Closures via static methods. --- CHANGELOG.md | 4 +- .../Loading/Http/Browser/BrowserAction.php | 54 +++++++++++ .../_Integration/Http/HeadlessBrowserTest.php | 95 ++++++++++++++++++- tests/_Integration/Server.php | 22 +++++ .../BrowserActions/ClickAndWaitForReload.php | 16 ++++ .../EvaluateAndWaitForReload.php | 5 + .../EvaluateAndWaitForReloadReloaded.php | 7 ++ .../_Server/BrowserActions/Main.php | 23 +++++ .../_Server/BrowserActions/Wait.php | 18 ++++ 9 files changed, 237 insertions(+), 7 deletions(-) create mode 100644 src/Steps/Loading/Http/Browser/BrowserAction.php create mode 100644 tests/_Integration/_Server/BrowserActions/ClickAndWaitForReload.php create mode 100644 tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReload.php create mode 100644 tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php create mode 100644 tests/_Integration/_Server/BrowserActions/Main.php create mode 100644 tests/_Integration/_Server/BrowserActions/Wait.php diff --git a/CHANGELOG.md b/CHANGELOG.md index 213b89b..30316c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [2.1.0] - 2024-10-17 +## [2.1.0] - 2024-10-18 ### Added -* The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows to define callback functions that are triggered after the headless browser navigated to the specified URL. They are called with the chrome-php `Page` object as argument. +* The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows to define callback functions that are triggered after the headless browser navigated to the specified URL. 
They are called with the chrome-php `Page` object as argument, so you can interact with the page. Also, there is a new class `BrowserAction` providing some simple actions (like wait for element, click element,...) as Closures via static methods. You can use it like `Http::get()->postBrowserNavigateHook(BrowserAction::clickElement('#element'))`. ## [2.0.1] - 2024-10-15 ### Fixed diff --git a/src/Steps/Loading/Http/Browser/BrowserAction.php b/src/Steps/Loading/Http/Browser/BrowserAction.php new file mode 100644 index 0000000..79d6ae0 --- /dev/null +++ b/src/Steps/Loading/Http/Browser/BrowserAction.php @@ -0,0 +1,54 @@ +waitUntilContainsElement($cssSelector); + }; + } + + public static function clickElement(string $cssSelector): Closure + { + return function (Page $page) use ($cssSelector) { + $page->mouse()->find($cssSelector)->click(); + }; + } + + public static function clickElementAndWaitForReload(string $cssSelector): Closure + { + return function (Page $page) use ($cssSelector) { + $page->mouse()->find($cssSelector)->click(); + + $page->waitForReload(); + }; + } + + public static function evaluate(string $jsCode): Closure + { + return function (Page $page) use ($jsCode) { + $page->evaluate($jsCode); + }; + } + + public static function evaluateAndWaitForReload(string $jsCode): Closure + { + return function (Page $page) use ($jsCode) { + $page->evaluate($jsCode)->waitForPageReload(); + }; + } + + public static function wait(float $seconds): Closure + { + return function (Page $page) use ($seconds) { + usleep(Microseconds::fromSeconds($seconds)->value); + }; + } +} diff --git a/tests/_Integration/Http/HeadlessBrowserTest.php b/tests/_Integration/Http/HeadlessBrowserTest.php index 927331c..4f8d779 100644 --- a/tests/_Integration/Http/HeadlessBrowserTest.php +++ b/tests/_Integration/Http/HeadlessBrowserTest.php @@ -6,11 +6,11 @@ use Crwlr\Crawler\Loader\LoaderInterface; use Crwlr\Crawler\Steps\Html; use Crwlr\Crawler\Steps\Loading\Http; +use 
Crwlr\Crawler\Steps\Loading\Http\Browser\BrowserAction; use Crwlr\Crawler\Steps\Step; use Crwlr\Crawler\UserAgents\UserAgent; use Crwlr\Crawler\UserAgents\UserAgentInterface; use Generator; -use HeadlessChromium\Page; use Psr\Log\LoggerInterface; use Symfony\Component\DomCrawler\Crawler; @@ -133,16 +133,14 @@ protected function invoke(mixed $input): Generator ->and($results[0]->get('printed-cookie'))->toBe('javascriptcookie'); }); -it('gets a cookie that is set via a click, executed via after navigate hook', function () { +it('gets a cookie that is set via a click, executed via post browser navigate hook', function () { $crawler = new HeadlessBrowserCrawler(); $crawler ->input('http://localhost:8000/set-delayed-js-cookie') ->addStep( Http::get() - ->postBrowserNavigateHook(function (Page $page) { - $page->mouse()->find('#setCookieButton')->click(); - }), + ->postBrowserNavigateHook(BrowserAction::clickElement('#setCookieButton')), ) ->addStep(new class extends Step { protected function invoke(mixed $input): Generator @@ -159,3 +157,90 @@ protected function invoke(mixed $input): Generator ->and($results[0]->get('printed-cookie'))->toBeString() ->and($results[0]->get('printed-cookie'))->toBe('jscookie'); }); + +test('BrowserActions waitUntilDocumentContainsElement(), clickElement() and evaluate() work as expected', function () { + $crawler = new HeadlessBrowserCrawler(); + + $crawler + ->input('http://localhost:8000/browser-actions') + ->addStep( + Http::get() + ->postBrowserNavigateHook( + BrowserAction::waitUntilDocumentContainsElement('#delayed_el_container #delayed_el'), + ) + ->postBrowserNavigateHook(BrowserAction::clickElement('#click_element')) + ->postBrowserNavigateHook( + BrowserAction::evaluate( + 'document.getElementById(\'evaluation_container\').innerHTML = \'evaluated\'', + ), + ) + ->keep('body'), + ); + + $results = helper_generatorToArray($crawler->run()); + + $body = $results[0]->get('body'); + + expect($body)->toContain('
a
') + ->and($body)->toContain('
yes
') + ->and($body)->toContain('
evaluated
'); +}); + +test('BrowserAction::clickElementAndWaitForReload() works as expected', function () { + $crawler = new HeadlessBrowserCrawler(); + + $crawler + ->input('http://localhost:8000/browser-actions/click-and-wait-for-reload') + ->addStep( + Http::get() + ->postBrowserNavigateHook(BrowserAction::clickElementAndWaitForReload('#click')) + ->keep('body'), + ); + + $results = helper_generatorToArray($crawler->run()); + + $body = $results[0]->get('body'); + + expect($body)->toContain('
yes
'); +}); + +test('BrowserAction::evaluateAndWaitForReload() works as expected', function () { + $crawler = new HeadlessBrowserCrawler(); + + $crawler + ->input('http://localhost:8000/browser-actions/evaluate-and-wait-for-reload') + ->addStep( + Http::get() + ->postBrowserNavigateHook( + BrowserAction::evaluateAndWaitForReload( + 'window.location.href = \'http://localhost:8000/browser-actions/' . + 'evaluate-and-wait-for-reload-reloaded\'', + ), + ) + ->keep('body'), + ); + + $results = helper_generatorToArray($crawler->run()); + + $body = $results[0]->get('body'); + + expect($body)->toContain('
yay
'); +}); + +test('BrowserAction::wait() works as expected', function () { + $crawler = new HeadlessBrowserCrawler(); + + $crawler + ->input('http://localhost:8000/browser-actions/wait') + ->addStep( + Http::get() + ->postBrowserNavigateHook(BrowserAction::wait(0.3)) + ->keep('body'), + ); + + $results = helper_generatorToArray($crawler->run()); + + $body = $results[0]->get('body'); + + expect($body)->toContain('
hooray
'); +}); diff --git a/tests/_Integration/Server.php b/tests/_Integration/Server.php index e17e49b..494080d 100644 --- a/tests/_Integration/Server.php +++ b/tests/_Integration/Server.php @@ -61,6 +61,28 @@ function getParamAfter(string $route, string $after): string return include(__DIR__ . '/_Server/SetDelayedCookieJs.php'); } +if (str_starts_with($route, '/browser-actions')) { + if ($route === '/browser-actions') { + return include(__DIR__ . '/_Server/BrowserActions/Main.php'); + } + + if (str_starts_with($route, '/browser-actions/click-and-wait-for-reload')) { + return include(__DIR__ . '/_Server/BrowserActions/ClickAndWaitForReload.php'); + } + + if ($route === '/browser-actions/evaluate-and-wait-for-reload') { + return include(__DIR__ . '/_Server/BrowserActions/EvaluateAndWaitForReload.php'); + } + + if ($route === '/browser-actions/evaluate-and-wait-for-reload-reloaded') { + return include(__DIR__ . '/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php'); + } + + if ($route === '/browser-actions/wait') { + return include(__DIR__ . '/_Server/BrowserActions/Wait.php'); + } +} + if ($route === '/print-cookie') { return include(__DIR__ . '/_Server/PrintCookie.php'); } diff --git a/tests/_Integration/_Server/BrowserActions/ClickAndWaitForReload.php b/tests/_Integration/_Server/BrowserActions/ClickAndWaitForReload.php new file mode 100644 index 0000000..5070f9b --- /dev/null +++ b/tests/_Integration/_Server/BrowserActions/ClickAndWaitForReload.php @@ -0,0 +1,16 @@ + + + + + Hello World + + +
+ Click here + + +
yes
+ +
+ + diff --git a/tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReload.php b/tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReload.php new file mode 100644 index 0000000..c29b008 --- /dev/null +++ b/tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReload.php @@ -0,0 +1,5 @@ + + +Hello World + + diff --git a/tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php b/tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php new file mode 100644 index 0000000..30b9ff2 --- /dev/null +++ b/tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php @@ -0,0 +1,7 @@ + + +Hello World + +
yay
+ + diff --git a/tests/_Integration/_Server/BrowserActions/Main.php b/tests/_Integration/_Server/BrowserActions/Main.php new file mode 100644 index 0000000..bc1e70a --- /dev/null +++ b/tests/_Integration/_Server/BrowserActions/Main.php @@ -0,0 +1,23 @@ + + + + + Hello World + + +
+
+ +
+
Click me
+ +
+ + +
+ + diff --git a/tests/_Integration/_Server/BrowserActions/Wait.php b/tests/_Integration/_Server/BrowserActions/Wait.php new file mode 100644 index 0000000..16ac9cf --- /dev/null +++ b/tests/_Integration/_Server/BrowserActions/Wait.php @@ -0,0 +1,18 @@ + + + + + Hello World + + +
+
+ + +
+ + From 2a878f368bffd712ea54f993e83c07cc4c9071df Mon Sep 17 00:00:00 2001 From: otsch Date: Sat, 19 Oct 2024 00:30:49 +0200 Subject: [PATCH 3/3] Update changelog date --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30316c3..494c624 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [2.1.0] - 2024-10-18 +## [2.1.0] - 2024-10-19 ### Added * The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows to define callback functions that are triggered after the headless browser navigated to the specified URL. They are called with the chrome-php `Page` object as argument, so you can interact with the page. Also, there is a new class `BrowserAction` providing some simple actions (like wait for element, click element,...) as Closures via static methods. You can use it like `Http::get()->postBrowserNavigateHook(BrowserAction::clickElement('#element'))`.