Skip to content

Commit

Permalink
Add browser cookies to the cookie jar
Browse files Browse the repository at this point in the history
Also add cookies, set during headless browser usage, to the cookie jar.
When switching back to the (guzzle) HTTP client the cookies should also
be sent.
  • Loading branch information
otsch committed Oct 20, 2024
1 parent 1b9b4d2 commit f881919
Show file tree
Hide file tree
Showing 9 changed files with 164 additions and 36 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [2.1.1] - 2024-10-20
### Fixed
* Also add cookies, set during headless browser usage, to the cookie jar. When switching back to the (guzzle) HTTP client the cookies should also be sent.

## [2.1.0] - 2024-10-19
### Added
* The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows you to define callback functions that are triggered after the headless browser has navigated to the specified URL. They are called with the chrome-php `Page` object as argument, so you can interact with the page. Also, there is a new class `BrowserAction` providing some simple actions (like wait for element, click element, ...) as Closures via static methods. You can use it like `Http::get()->postBrowserNavigateHook(BrowserAction::clickElement('#element'))`.
Expand Down
69 changes: 63 additions & 6 deletions src/Loader/Http/Cookies/CookieJar.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Url\Url;
use Exception;
use HeadlessChromium\Cookies\CookiesCollection;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;

Expand Down Expand Up @@ -37,17 +38,38 @@ public function flush(): void
* @throws InvalidCookieException
* @throws Exception
*/
public function addFrom(string|UriInterface|Url $url, ResponseInterface $response): void
public function addFrom(string|UriInterface|Url $url, ResponseInterface|CookiesCollection $response): void
{
$cookieHeaders = $response->getHeader('set-cookie');
if ($response instanceof CookiesCollection) {
$this->addFromBrowserCookieCollection($url, $response);
} else {
$cookieHeaders = $response->getHeader('set-cookie');

if (!empty($cookieHeaders)) {
if (!empty($cookieHeaders)) {
$url = !$url instanceof Url ? Url::parse($url) : $url;

foreach ($cookieHeaders as $cookieHeader) {
$cookie = new Cookie($url, $cookieHeader);

$this->jar[$cookie->domain()][$cookie->name()] = $cookie;
}
}
}
}

/**
* @throws InvalidCookieException
* @throws Exception
*/
public function addFromBrowserCookieCollection(string|UriInterface|Url $url, CookiesCollection $collection): void
{
if ($collection->count() > 0) {
$url = !$url instanceof Url ? Url::parse($url) : $url;

foreach ($cookieHeaders as $cookieHeader) {
$cookie = new Cookie($url, $cookieHeader);
foreach ($collection as $cookie) {
$cookie = new Cookie($url, $this->buildSetCookieHeaderFromBrowserCookie($cookie));

$this->jar[$this->getForDomainFromUrl($url)][$cookie->name()] = $cookie;
$this->jar[$cookie->domain()][$cookie->name()] = $cookie;
}
}
}
Expand Down Expand Up @@ -92,4 +114,39 @@ protected function getForDomainFromUrl(string|UriInterface|Url $url): ?string

return $forDomain;
}

/**
 * Builds a `Set-Cookie` header line from a cookie object reported by the headless browser.
 *
 * The resulting string is meant to be fed into this package's own `Cookie` class, which
 * parses `Set-Cookie` header values. Only attributes that are actually present (and
 * meaningful) on the browser cookie are appended.
 *
 * @param \HeadlessChromium\Cookies\Cookie $cookie the cookie as returned by the browser page
 * @return string a header value like `name=value; Domain=example.com; Path=/; Secure; HttpOnly`
 */
protected function buildSetCookieHeaderFromBrowserCookie(\HeadlessChromium\Cookies\Cookie $cookie): string
{
    $header = $cookie->getName() . '=' . $cookie->getValue();

    if ($cookie->getDomain() !== null) {
        $header .= '; Domain=' . $cookie->getDomain();
    }

    if ($cookie->offsetExists('expires')) {
        $expires = $cookie->offsetGet('expires');

        if (is_numeric($expires)) {
            // The browser reports `expires` as a Unix timestamp in seconds (see the Chrome DevTools
            // Protocol `Network.Cookie` type), with -1 marking a session cookie. A raw number is not
            // a valid HTTP date, so convert it to the IMF-fixdate format used in Set-Cookie headers.
            if ((float) $expires !== -1.0) {
                $header .= '; Expires=' . gmdate('D, d M Y H:i:s', (int) $expires) . ' GMT';
            }
        } elseif (!empty($expires)) {
            // Anything non-numeric is passed through unchanged (presumably already a date string).
            $header .= '; Expires=' . $expires;
        }
    }

    if ($cookie->offsetExists('max-age')) {
        $maxAge = $cookie->offsetGet('max-age');

        // Check the max-age value itself (not the path). `Max-Age=0` is a valid value,
        // so a plain `empty()` check would wrongly drop it.
        if (is_numeric($maxAge)) {
            $header .= '; Max-Age=' . $maxAge;
        }
    }

    if ($cookie->offsetExists('path') && !empty($cookie->offsetGet('path'))) {
        $header .= '; Path=' . $cookie->offsetGet('path');
    }

    // `Secure` is a flag attribute without a value.
    if ($cookie->offsetExists('secure') && !empty($cookie->offsetGet('secure'))) {
        $header .= '; Secure';
    }

    if ($cookie->offsetExists('httpOnly') && $cookie->offsetGet('httpOnly') === true) {
        $header .= '; HttpOnly';
    }

    if ($cookie->offsetExists('sameSite') && !empty($cookie->offsetGet('sameSite'))) {
        $header .= '; SameSite=' . $cookie->offsetGet('sameSite');
    }

    return $header;
}
}
29 changes: 27 additions & 2 deletions src/Loader/Http/HeadlessBrowserLoaderHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
namespace Crwlr\Crawler\Loader\Http;

use Closure;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Exception;
Expand All @@ -19,6 +21,7 @@
use HeadlessChromium\Exception\OperationTimedOut;
use HeadlessChromium\Page;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\UriInterface;

class HeadlessBrowserLoaderHelper
{
Expand Down Expand Up @@ -80,6 +83,7 @@ public function navigateToPageAndGetRespondedRequest(
RequestInterface $request,
Throttler $throttler,
?string $proxy = null,
?CookieJar $cookieJar = null,
): RespondedRequest {
$browser = $this->getBrowser($request, $proxy);

Expand All @@ -102,12 +106,14 @@ function ($params) use (&$statusCode, &$responseHeaders) {

$this->navigate($request->getUri()->__toString());

$this->callPostNavigateHooks();

$throttler->trackRequestEndFor($request->getUri());

$this->callPostNavigateHooks();

$html = $this->page?->getHtml();

$this->addCookiesToJar($cookieJar, $request->getUri());

return new RespondedRequest(
$request,
new Response($statusCode, $responseHeaders, $html),
Expand Down Expand Up @@ -230,6 +236,25 @@ protected function callPostNavigateHooks(): void
$this->tempPostNavigateHooks = [];
}

/**
 * Passes the cookies of the current browser page on to the given cookie jar.
 *
 * Nothing happens when no cookie jar was provided, or when there is no page
 * (or no cookie collection) to read the cookies from.
 *
 * @throws CommunicationException
 * @throws OperationTimedOut
 * @throws NoResponseAvailable
 * @throws InvalidCookieException
 */
protected function addCookiesToJar(?CookieJar $cookieJar, UriInterface $requestUrl): void
{
    if ($cookieJar === null) {
        return;
    }

    $browserCookies = $this->page?->getCookies();

    if (!$browserCookies) {
        return;
    }

    $cookieJar->addFrom($requestUrl, $browserCookies);
}

/**
* @throws Exception
*/
Expand Down
9 changes: 7 additions & 2 deletions src/Loader/Http/HttpLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,12 @@ protected function loadViaClientOrHeadlessBrowser(RequestInterface $request): Re
if ($this->useHeadlessBrowser) {
$proxy = $this->proxies?->getProxy() ?? null;

return $this->browser()->navigateToPageAndGetRespondedRequest($request, $this->throttler, $proxy);
return $this->browser()->navigateToPageAndGetRespondedRequest(
$request,
$this->throttler,
$proxy,
$this->cookieJar,
);
}

return $this->handleRedirects($request);
Expand Down Expand Up @@ -626,7 +631,7 @@ protected function addCookiesToJar(RespondedRequest $respondedRequest): void
*/
protected function addCookiesToRequest(RequestInterface $request): RequestInterface
{
if (!$this->useCookies) {
if (!$this->useCookies || $this->usesHeadlessBrowser()) {
return $request;
}

Expand Down
15 changes: 9 additions & 6 deletions tests/Loader/Http/HttpLoaderPolitenessTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
use GuzzleHttp\Psr7\Response;
use HeadlessChromium\Browser;
use HeadlessChromium\Communication\Session;
use HeadlessChromium\Cookies\CookiesCollection;
use HeadlessChromium\Page;
use HeadlessChromium\PageUtils\PageNavigation;
use Mockery;
Expand Down Expand Up @@ -54,9 +55,8 @@ function helper_wait300ms(): void

$diff = $secondResponse - $firstResponse;

expect($diff)->toBeGreaterThan(0.3);

expect($diff)->toBeLessThan(0.62);
expect($diff)->toBeGreaterThan(0.3)
->and($diff)->toBeLessThan(0.62);
})->with(['load', 'loadOrFail']);

it('also throttles requests using the headless browser', function ($loadingMethod) {
Expand All @@ -83,6 +83,8 @@ function helper_wait300ms(): void
return $pageNavigationMock;
});

$pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection());

$pageMock->shouldReceive('getHtml')->andReturn('<html>foo</html>');

$browserMock->shouldReceive('createPage')->andReturn($pageMock);
Expand All @@ -104,6 +106,8 @@ function helper_wait300ms(): void

$pageMock->shouldReceive('navigate')->andReturn($pageNavigationMock);

$pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection());

$firstResponse = microtime(true);

$loader->{$loadingMethod}('https://www.example.com/bar');
Expand All @@ -112,9 +116,8 @@ function helper_wait300ms(): void

$diff = $secondResponse - $firstResponse;

expect($diff)->toBeGreaterThan(0.3);

expect($diff)->toBeLessThan(0.62);
expect($diff)->toBeGreaterThan(0.3)
->and($diff)->toBeLessThan(0.62);
})->with(['load', 'loadOrFail']);

it('does not throttle requests to different domains', function ($loadingMethod) {
Expand Down
51 changes: 35 additions & 16 deletions tests/_Integration/Http/HeadlessBrowserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\Cookies\Cookie;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
Expand Down Expand Up @@ -56,6 +59,18 @@ protected function invoke(mixed $input): Generator
}
}

/**
 * Test helper: fetches all cookies stored for a domain from the loader's cookie jar,
 * which is read from the loader instance via invade().
 *
 * @return Cookie[]
 */
function helper_getCookiesByDomainFromLoader(HttpLoader $loader, string $domain): array
{
    /** @var CookieJar $jar */
    $jar = invade($loader)->cookieJar;

    return $jar->allByDomain($domain);
}

it('automatically uses the Loader\'s user agent', function () {
$crawler = new HeadlessBrowserCrawler();

Expand Down Expand Up @@ -111,26 +126,22 @@ protected function invoke(mixed $input): Generator
]);
});

it('also gets cookies that are set via javascript', function () {
it('gets cookies that are set via javascript', function () {
$crawler = new HeadlessBrowserCrawler();

$crawler
->input('http://localhost:8000/set-js-cookie')
->addStep(Http::get())
->addStep(new class extends Step {
protected function invoke(mixed $input): Generator
{
yield 'http://localhost:8000/print-cookie';
}
})
->addStep(Http::get())
->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));
->addStep(Http::get());

$results = helper_generatorToArray($crawler->run());
helper_generatorToArray($crawler->run());

expect($results)->toHaveCount(1)
->and($results[0]->get('printed-cookie'))->toBeString()
->and($results[0]->get('printed-cookie'))->toBe('javascriptcookie');
$cookiesInJar = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost');

$testCookie = $cookiesInJar['testcookie'] ?? null;

expect($cookiesInJar)->toHaveCount(1)
->and($testCookie?->name())->toBe('testcookie')
->and($testCookie?->value())->toBe('javascriptcookie');
});

it('gets a cookie that is set via a click, executed via post browser navigate hook', function () {
Expand All @@ -140,7 +151,7 @@ protected function invoke(mixed $input): Generator
->input('http://localhost:8000/set-delayed-js-cookie')
->addStep(
Http::get()
->postBrowserNavigateHook(BrowserAction::clickElement('#setCookieButton')),
->postBrowserNavigateHook(BrowserAction::clickElement('#consent_btn')),
)
->addStep(new class extends Step {
protected function invoke(mixed $input): Generator
Expand All @@ -155,7 +166,15 @@ protected function invoke(mixed $input): Generator

expect($results)->toHaveCount(1)
->and($results[0]->get('printed-cookie'))->toBeString()
->and($results[0]->get('printed-cookie'))->toBe('jscookie');
->and($results[0]->get('printed-cookie'))->toBe('javascriptcookie');

$cookiesInJar = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost');

$testCookie = $cookiesInJar['testcookie'] ?? null;

expect($cookiesInJar)->toHaveCount(1)
->and($testCookie?->name())->toBe('testcookie')
->and($testCookie?->value())->toBe('javascriptcookie');
});

test('BrowserActions waitUntilDocumentContainsElement(), clickElement() and evaluate() work as expected', function () {
Expand Down
12 changes: 12 additions & 0 deletions tests/_Integration/Server.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,18 @@ function getParamAfter(string $route, string $after): string
return include(__DIR__ . '/_Server/SetCookieJs.php');
}

// Serves the JavaScript file that the /set-delayed-js-cookie page includes via a <script> tag.
// Once the DOM is loaded, the script attaches a click handler to the element with the ID
// "consent_btn"; clicking it sets the cookie "testcookie=javascriptcookie" via document.cookie.
if ($route === '/scripts/set-cookie.js') {
echo <<<JS
document.addEventListener("DOMContentLoaded", function () {
document.getElementById('consent_btn').addEventListener('click', function (ev) {
ev.preventDefault();
document.cookie = "testcookie=javascriptcookie";
}, false);
}, false);
JS;
return;
}

if ($route === '/set-delayed-js-cookie') {
return include(__DIR__ . '/_Server/SetDelayedCookieJs.php');
}
Expand Down
1 change: 0 additions & 1 deletion tests/_Integration/_Server/SetCookieJs.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
<?php setcookie('testcookie', 'foo123'); ?>
<!doctype html>
<html lang="de">
<head><meta charset=utf-8><title>yo</title></head>
Expand Down
10 changes: 7 additions & 3 deletions tests/_Integration/_Server/SetDelayedCookieJs.php
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
<?php setcookie('testcookie', 'foo123'); ?>
<!doctype html>
<html lang="de">
<head><meta charset=utf-8><title>Hey</title></head>
<head>
<meta charset=utf-8><title>Hey</title>
<script src="/scripts/set-cookie.js"></script>
</head>
<body>
<div>
<button id="setCookieButton" onclick="document.cookie = 'testcookie=jscookie'"></button>
<button type="button" id="consent_btn">
Accept Cookie
</button>
</div>
</body>
</html>

0 comments on commit f881919

Please sign in to comment.