Skip to content

Commit

Permalink
Post Browser Navigate Hooks
Browse files Browse the repository at this point in the history
Adds post browser navigate hooks that are passed from `Http` steps to
the browser loader helper and are executed after the headless browser
has navigated to the specified URL. They are called with the chrome-php
`Page` object as an argument, so you can run things on that page before
the HTML is retrieved and the response is returned.
  • Loading branch information
otsch committed Oct 17, 2024
1 parent 52dd36b commit ec310ca
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [2.1.0] - 2024-10-17
### Added
* The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows you to define callback functions that are triggered after the headless browser has navigated to the specified URL. They are called with the chrome-php `Page` object as an argument.

## [2.0.1] - 2024-10-15
### Fixed
* Issue with the `afterLoad` hook of the `HttpLoader`, introduced in v2. Calling the hook was commented out, which slipped through because the test case was faulty.
Expand Down
34 changes: 34 additions & 0 deletions src/Loader/Http/HeadlessBrowserLoaderHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace Crwlr\Crawler\Loader\Http;

use Closure;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Exception;
Expand Down Expand Up @@ -42,8 +43,28 @@ class HeadlessBrowserLoaderHelper

protected int $timeout = 30_000;

/**
* @var Closure[]
*/
protected array $tempPostNavigateHooks = [];

public function __construct(private ?BrowserFactory $browserFactory = null) {}

/**
 * Set temporary post navigate hooks.
 *
 * The given list replaces any hooks that were set before. The hooks are executed during
 * the next call to navigateToPageAndGetRespondedRequest(), right after the browser has
 * navigated to the requested URL, and are forgotten afterward.
 *
 * @param Closure[] $hooks callbacks, each invoked with the current page as its only argument
 * @return static this helper instance, to allow method chaining
 */
public function setTempPostNavigateHooks(array $hooks): static
{
$this->tempPostNavigateHooks = $hooks;

return $this;
}

/**
* @throws OperationTimedOut
* @throws CommunicationException
Expand Down Expand Up @@ -81,6 +102,8 @@ function ($params) use (&$statusCode, &$responseHeaders) {

$this->navigate($request->getUri()->__toString());

$this->callPostNavigateHooks();

$throttler->trackRequestEndFor($request->getUri());

$html = $this->page?->getHtml();
Expand Down Expand Up @@ -196,6 +219,17 @@ protected function navigate(string $url): void
}
}

/**
 * Run the temporary post navigate hooks and forget them afterward.
 *
 * Every hook is invoked via Closure::call(), bound to this helper instance, with the
 * current page as its only argument.
 *
 * The hook list is emptied in a finally block, so the hooks are also discarded when one
 * of them throws. Otherwise they would stay registered and be executed again on a later,
 * unrelated navigation.
 *
 * NOTE(review): Closure::call() presumably cannot bind static closures (PHP emits a
 * warning instead of running them) - confirm and document that hooks must be non-static.
 */
protected function callPostNavigateHooks(): void
{
    try {
        foreach ($this->tempPostNavigateHooks as $hook) {
            $hook->call($this, $this->page);
        }
    } finally {
        $this->tempPostNavigateHooks = [];
    }
}

/**
* @throws Exception
*/
Expand Down
23 changes: 21 additions & 2 deletions src/Steps/Loading/HttpBase.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace Crwlr\Crawler\Steps\Loading;

use Closure;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
Expand Down Expand Up @@ -42,6 +43,11 @@ abstract class HttpBase extends Step
*/
protected ?array $inputHeaders = null;

/**
* @var Closure[]
*/
protected array $postBrowserNavigateHooks = [];

/**
* @param string $method
* @param array<string, string|string[]> $headers
Expand Down Expand Up @@ -127,6 +133,13 @@ public function useInputKeyAsHeaders(string $key): static
return $this;
}

/**
 * Add a callback to this step's list of post browser navigate hooks.
 *
 * When the step's loader uses the headless browser, the collected hooks are handed to the
 * browser loader helper before a request is loaded. The helper calls them with the
 * chrome-php Page object as argument, after the browser has navigated to the URL.
 *
 * @param Closure $callback hook to append; hooks already added are kept
 * @return static this step, to allow method chaining
 */
public function postBrowserNavigateHook(Closure $callback): static
{
$this->postBrowserNavigateHooks[] = $callback;

return $this;
}

/**
* @return UriInterface|UriInterface[]
* @throws InvalidArgumentException
Expand Down Expand Up @@ -185,10 +198,16 @@ protected function getRequestFromInputUri(UriInterface $uri): RequestInterface
*/
protected function getResponseFromRequest(RequestInterface $request): ?RespondedRequest
{
$loader = $this->getLoader();

if (!empty($this->postBrowserNavigateHooks) && $loader->usesHeadlessBrowser()) {
$loader->browser()->setTempPostNavigateHooks($this->postBrowserNavigateHooks);
}

if ($this->stopOnErrorResponse) {
$response = $this->getLoader()->loadOrFail($request);
$response = $loader->loadOrFail($request);
} else {
$response = $this->getLoader()->load($request);
$response = $loader->load($request);
}

if ($response !== null && ($response->response->getStatusCode() < 400 || $this->yieldErrorResponses)) {
Expand Down
40 changes: 40 additions & 0 deletions tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,43 @@ function helper_setUpHeadlessChromeMocks(

expect($invadedBrowserFactory->chromeBinary)->toBe($chromeExecutable);
});

it('calls the temporary post navigate hooks once', function () {
    $helper = new HeadlessBrowserLoaderHelper(helper_setUpHeadlessChromeMocks());

    // One flag per hook; every hook flips only its own flag.
    $notCalled = ['hook1' => false, 'hook2' => false, 'hook3' => false];

    $called = $notCalled;

    $hooks = [];

    foreach (array_keys($called) as $name) {
        // Kept non-static on purpose: the helper invokes the hooks via Closure::call().
        $hooks[] = function (Page $page) use (&$called, $name) {
            $called[$name] = true;
        };
    }

    $helper->setTempPostNavigateHooks($hooks);

    $navigate = function () use ($helper) {
        $helper->navigateToPageAndGetRespondedRequest(
            new Request('GET', 'https://www.example.com/foo'),
            helper_getMinThrottler(),
        );
    };

    // First navigation: all three hooks have to run.
    $navigate();

    expect($called['hook1'])->toBeTrue()
        ->and($called['hook2'])->toBeTrue()
        ->and($called['hook3'])->toBeTrue();

    // Second navigation: the hooks were temporary, so none of them may run again.
    $called = $notCalled;

    $navigate();

    expect($called['hook1'])->toBeFalse()
        ->and($called['hook2'])->toBeFalse()
        ->and($called['hook3'])->toBeFalse();
});
28 changes: 28 additions & 0 deletions tests/_Integration/Http/HeadlessBrowserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Generator;
use HeadlessChromium\Page;
use Psr\Log\LoggerInterface;
use Symfony\Component\DomCrawler\Crawler;

Expand Down Expand Up @@ -131,3 +132,30 @@ protected function invoke(mixed $input): Generator
->and($results[0]->get('printed-cookie'))->toBeString()
->and($results[0]->get('printed-cookie'))->toBe('javascriptcookie');
});

it('gets a cookie that is set via a click, executed via after navigate hook', function () {
    // Hook run in the headless browser after navigating: clicks the button whose onclick
    // handler sets the cookie via JavaScript.
    $clickSetCookieButton = function (Page $page) {
        $page->mouse()->find('#setCookieButton')->click();
    };

    // Intermediate step that ignores its input and just yields the URL to load next.
    $yieldPrintCookieUrl = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'http://localhost:8000/print-cookie';
        }
    };

    $crawler = new HeadlessBrowserCrawler();

    $crawler
        ->input('http://localhost:8000/set-delayed-js-cookie')
        ->addStep(Http::get()->postBrowserNavigateHook($clickSetCookieButton))
        ->addStep($yieldPrintCookieUrl)
        ->addStep(Http::get())
        ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    $printedCookie = $results[0]->get('printed-cookie');

    // The value set by the click handler is expected, not the one from the response header.
    expect($printedCookie)->toBeString()
        ->and($printedCookie)->toBe('jscookie');
});
4 changes: 4 additions & 0 deletions tests/_Integration/Server.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ function getParamAfter(string $route, string $after): string
return include(__DIR__ . '/_Server/SetCookieJs.php');
}

if ($route === '/set-delayed-js-cookie') {
return include(__DIR__ . '/_Server/SetDelayedCookieJs.php');
}

if ($route === '/print-cookie') {
return include(__DIR__ . '/_Server/PrintCookie.php');
}
Expand Down
1 change: 1 addition & 0 deletions tests/_Integration/_Server/SetCookieJs.php
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
<?php setcookie('testcookie', 'foo123'); ?>
<!doctype html>
<html lang="de">
<head><meta charset=utf-8><title>yo</title></head>
Expand Down
10 changes: 10 additions & 0 deletions tests/_Integration/_Server/SetDelayedCookieJs.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<?php
// Test fixture: the response sets the cookie `testcookie` to `foo123`. The cookie is only
// set to `jscookie` from JavaScript when the button in the page below is clicked.
setcookie('testcookie', 'foo123'); ?>
<!doctype html>
<html lang="de">
<head><meta charset=utf-8><title>Hey</title></head>
<body>
<div>
<button id="setCookieButton" onclick="document.cookie = 'testcookie=jscookie'"></button>
</div>
</body>
</html>

0 comments on commit ec310ca

Please sign in to comment.