Skip to content

Commit

Permalink
Add option to use headless browser with HttpLoader
Browse files Browse the repository at this point in the history
Add the chrome-php/chrome composer dependency and a simple option to use a
headless Chrome browser with the HttpLoader. This is enough to get the
HTML after executing JavaScript in the browser. For more sophisticated
tasks, it's better not to extend the HttpLoader further, but instead to
create a separate Loader and/or Steps.
  • Loading branch information
otsch committed Aug 25, 2022
1 parent 8f6a20f commit 9cf42d7
Show file tree
Hide file tree
Showing 10 changed files with 346 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]
### Added
* You can now call the new `useHeadlessBrowser` method on the `HttpLoader` class to use a headless Chrome browser to load pages. This is enough to get the HTML after executing JavaScript in the browser. For more sophisticated tasks, it is better to create a separate Loader and/or Steps.
* With the `maxOutputs()` method of the abstract `Step` class you can now limit how many outputs a certain step should yield at max. That's for example helpful during development, when you want to run the crawler only with a small subset of the data/requests it will actually have to process when you eventually remove the limits. When a step has reached its limit, it won't even call the `invoke()` method any longer until the step is reset after a run.
* With the new `outputHook()` method of the abstract `Crawler` class you can set a closure that'll receive all the outputs from all the steps. This should only be used for debugging purposes.
* The `extract()` method of the `Html` and `Xml` (children of `Dom`) steps now also works with a single selector instead of an array with a mapping. Sometimes you'll want to just get a simple string output e.g. for a next step, instead of an array with mapped extracted data.
Expand Down
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
"symfony/css-selector": "^6.0",
"psr/simple-cache": "^1.0|^2.0|^3.0",
"guzzlehttp/guzzle": "^7.4",
"adbario/php-dot-notation": "^3.1"
"adbario/php-dot-notation": "^3.1",
"chrome-php/chrome": "^1.6"
},
"require-dev": {
"pestphp/pest": "^1.21",
Expand Down
190 changes: 186 additions & 4 deletions src/Loader/Http/HttpLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use HeadlessChromium\Browser;
use HeadlessChromium\BrowserFactory;
use HeadlessChromium\Exception\CommunicationException;
use HeadlessChromium\Exception\NavigationExpired;
use HeadlessChromium\Exception\NoResponseAvailable;
use HeadlessChromium\Exception\OperationTimedOut;
use InvalidArgumentException;
use Psr\Http\Client\ClientExceptionInterface;
use Psr\Http\Client\ClientInterface;
Expand All @@ -24,9 +31,24 @@
class HttpLoader extends Loader
{
/**
 * HTTP client that sends the requests when the headless browser is not used.
 */
protected ClientInterface $httpClient;

/**
 * Cookie jar of this loader; emptied by flushCookies().
 */
protected CookieJar $cookieJar;

/**
 * Checked before cookies from a response are added to the jar.
 */
protected bool $useCookies = true;

/**
 * When true, requests are loaded via the headless browser instead of the HTTP client.
 * Toggled by useHeadlessBrowser() and useHttpClient().
 */
protected bool $useHeadlessBrowser = false;

/**
 * Options handed to the browser factory when the headless browser is created.
 * The 'userAgent' and 'headers' keys are filled in at creation time.
 *
* @var mixed[]
*/
protected array $headlessBrowserOptions = [
'windowSize' => [1920, 1000],
];

/**
 * Set to true whenever the options are changed, so that the next request
 * closes the current browser and creates a new one with the current options.
 */
protected bool $headlessBrowserOptionsDirty = false;

/**
 * Lazily created browser instance, reused across requests; null until first needed.
 */
protected ?Browser $headlessBrowser = null;

public function __construct(
UserAgentInterface $userAgent,
?ClientInterface $httpClient = null,
Expand Down Expand Up @@ -65,14 +87,16 @@ public function load(mixed $subject): ?RespondedRequest
}

$request = $this->prepareRequest($request);

$this->callHook('beforeLoad', $request);

try {
$respondedRequest = $this->getFromCache($request);

$isFromCache = $respondedRequest !== null;

if (!$respondedRequest) {
$respondedRequest = $this->handleRedirects($request);
$respondedRequest = $this->loadViaClientOrHeadlessBrowser($request);
}

if ($respondedRequest->response->getStatusCode() < 400) {
Expand All @@ -83,12 +107,14 @@ public function load(mixed $subject): ?RespondedRequest

if (!$isFromCache && $this->cache) {
$responseCacheItem = HttpResponseCacheItem::fromAggregate($respondedRequest);

$this->cache->set($responseCacheItem->key(), $responseCacheItem);
}

return $respondedRequest;
} catch (Throwable $exception) {
$this->trackRequestEnd(); // Don't move to finally so hooks don't run before it.

$this->callHook('onError', $request, $exception);

return null;
Expand All @@ -99,31 +125,46 @@ public function load(mixed $subject): ?RespondedRequest

/**
* @throws ClientExceptionInterface
* @throws CommunicationException
* @throws CommunicationException\CannotReadResponse
* @throws CommunicationException\InvalidResponse
* @throws CommunicationException\ResponseHasError
* @throws LoadingException
* @throws NavigationExpired
* @throws NoResponseAvailable
* @throws OperationTimedOut
* @throws Throwable
* @throws \Psr\SimpleCache\InvalidArgumentException
*/
public function loadOrFail(mixed $subject): RespondedRequest
{
$request = $this->validateSubjectType($subject);

$this->isAllowedToBeLoaded($request->getUri(), true);

$request = $this->prepareRequest($request);

$this->callHook('beforeLoad', $request);

$respondedRequest = $this->getFromCache($request);

$isFromCache = $respondedRequest !== null;

if (!$respondedRequest) {
$respondedRequest = $this->handleRedirects($request);
$respondedRequest = $this->loadViaClientOrHeadlessBrowser($request);
}

if ($respondedRequest->response->getStatusCode() >= 400) {
throw new LoadingException('Failed to load ' . $request->getUri()->__toString());
}

$this->callHook('onSuccess', $request, $respondedRequest->response);

$this->callHook('afterLoad', $request);

if (!$isFromCache && $this->cache) {
$responseCacheItem = HttpResponseCacheItem::fromAggregate($respondedRequest);

$this->cache->set($responseCacheItem->key(), $responseCacheItem);
}

Expand All @@ -142,6 +183,48 @@ public function flushCookies(): void
$this->cookieJar->flush();
}

/**
 * Switch this loader to loading pages via a headless browser.
 *
 * Only the flag is set here; the browser itself is created lazily
 * when the first request is loaded.
 *
 * @return static the loader itself, for method chaining
 */
public function useHeadlessBrowser(): static
{
    $this->useHeadlessBrowser = true;

    return $this;
}

/**
 * Switch this loader back to loading via the HTTP client.
 *
 * A browser instance that was created in the meantime is closed explicitly
 * before the reference is dropped, the same way getBrowser() closes an
 * instance before replacing it.
 *
 * @return static the loader itself, for method chaining
 */
public function useHttpClient(): static
{
    $this->useHeadlessBrowser = false;

    // Close first, then forget: don't rely on garbage collection to end the browser.
    $this->headlessBrowser?->close();

    $this->headlessBrowser = null;

    return $this;
}

/**
 * Replace all headless browser options with the given ones.
 *
 * The options are flagged as changed, so the next request loaded via the
 * headless browser creates a new browser instance with these options.
 *
 * @param mixed[] $options complete set of options; previously set options are discarded
 * @return static the loader itself, for method chaining
 */
public function setHeadlessBrowserOptions(array $options): static
{
    $this->headlessBrowserOptionsDirty = true;

    $this->headlessBrowserOptions = $options;

    return $this;
}

/**
 * Add options to the current headless browser options.
 *
 * Keys that are already set are overwritten, all other existing options are kept.
 * The options are flagged as changed, so the next request loaded via the
 * headless browser creates a new browser instance with the merged options.
 *
 * @param mixed[] $options options to add or overwrite, keyed by option name
 * @return static the loader itself, for method chaining
 */
public function addHeadlessBrowserOptions(array $options): static
{
    // array_replace() overwrites by key (also for integer keys) and appends unknown keys.
    $this->headlessBrowserOptions = array_replace($this->headlessBrowserOptions, $options);

    $this->headlessBrowserOptionsDirty = true;

    return $this;
}

/**
* @throws \Psr\SimpleCache\InvalidArgumentException
*/
Expand All @@ -155,6 +238,7 @@ protected function getFromCache(RequestInterface $request): ?RespondedRequest

if ($this->cache->has($key)) {
$this->logger->info('Found ' . $request->getUri()->__toString() . ' in cache.');

$responseCacheItem = $this->cache->get($key);

return $responseCacheItem->aggregate();
Expand All @@ -179,9 +263,30 @@ protected function validateSubjectType(RequestInterface|string $requestOrUri): R
/**
 * Prepare a request for sending: set the loader's user agent as the
 * User-Agent header and add cookies via addCookiesToRequest().
 *
 * @param RequestInterface $request the request to prepare
 * @return RequestInterface the request with User-Agent header and cookies applied
 */
protected function prepareRequest(RequestInterface $request): RequestInterface
{
    $request = $request->withHeader('User-Agent', $this->userAgent->__toString());

    // Single exit point; no duplicated cookie step or unreachable second return.
    return $this->addCookiesToRequest($request);
}

/**
 * Load the request with whichever mechanism is currently selected:
 * the headless browser if it was enabled, otherwise the HTTP client
 * (following redirects).
 *
 * @throws ClientExceptionInterface
 * @throws CommunicationException
 * @throws CommunicationException\CannotReadResponse
 * @throws CommunicationException\InvalidResponse
 * @throws CommunicationException\ResponseHasError
 * @throws NavigationExpired
 * @throws NoResponseAvailable
 * @throws OperationTimedOut
 * @throws Throwable
 */
private function loadViaClientOrHeadlessBrowser(RequestInterface $request): RespondedRequest
{
    return $this->useHeadlessBrowser
        ? $this->loadViaHeadlessBrowser($request)
        : $this->handleRedirects($request);
}

/**
Expand All @@ -192,7 +297,9 @@ private function handleRedirects(
?RespondedRequest $aggregate = null
): RespondedRequest {
$this->trackRequestStart();

$response = $this->httpClient->sendRequest($request);

$this->trackRequestEnd();

if (!$aggregate) {
Expand All @@ -205,6 +312,7 @@ private function handleRedirects(

if ($aggregate->isRedirect()) {
$this->logger()->info('Load redirect to: ' . $aggregate->effectiveUri());

$newRequest = $request->withUri(Url::parsePsr7($aggregate->effectiveUri()));

return $this->handleRedirects($newRequest, $aggregate);
Expand All @@ -213,6 +321,67 @@ private function handleRedirects(
return $aggregate;
}

/**
 * Load the request's URL in a new page of the headless browser and return
 * the page's HTML (after navigation has finished) as the response body.
 *
 * Status code and headers are taken from the first Network.responseReceived
 * event of the page's session. If no such event arrives, the response gets
 * status code 500 and no headers.
 *
 * The page is always closed again, also when navigation fails, so pages
 * don't pile up in the browser over many requests.
 *
 * @param RequestInterface $request the request whose URI is opened in the browser
 * @return RespondedRequest the request together with the response built from the browser's data
 * @throws CommunicationException
 * @throws CommunicationException\CannotReadResponse
 * @throws CommunicationException\InvalidResponse
 * @throws CommunicationException\ResponseHasError
 * @throws NavigationExpired
 * @throws NoResponseAvailable
 * @throws OperationTimedOut
 * @throws Throwable
 */
private function loadViaHeadlessBrowser(RequestInterface $request): RespondedRequest
{
    $page = $this->getBrowser($request)->createPage();

    try {
        // Fallback values, used when no response event is received.
        $statusCode = 500;

        $responseHeaders = [];

        // once(): only the first response event is used.
        // NOTE(review): presumably that is the main document's response - confirm.
        $page->getSession()->once(
            'method:Network.responseReceived',
            function ($params) use (&$statusCode, &$responseHeaders) {
                $statusCode = $params['response']['status'];

                $responseHeaders = $this->sanitizeResponseHeaders($params['response']['headers']);
            }
        );

        $page->navigate($request->getUri()->__toString())
            ->waitForNavigation();

        $html = $page->getHtml();
    } finally {
        // Release the page in any case; previously it was never closed.
        $page->close();
    }

    return new RespondedRequest(
        $request,
        new Response($statusCode, $responseHeaders, $html)
    );
}

/**
 * Get the headless browser instance, creating it when there is none yet
 * or when the browser options were changed since it was created.
 *
 * The user agent and the request's headers are added to the options when
 * the browser is created. So the headers are those of the request that
 * triggered the creation; later requests reuse the same browser.
 *
 * @param RequestInterface $request the request whose headers are passed to a newly created browser
 * @return Browser the (possibly newly created) browser instance
 */
private function getBrowser(RequestInterface $request): Browser
{
    if ($this->headlessBrowser !== null && !$this->headlessBrowserOptionsDirty) {
        return $this->headlessBrowser;
    }

    $this->headlessBrowser?->close();

    // Forget the closed instance right away, so it can't be handed out again
    // if creating the new browser fails.
    $this->headlessBrowser = null;

    $options = $this->headlessBrowserOptions;

    $options['userAgent'] = $this->userAgent->__toString();

    // PSR-7 getHeaders() yields a list of values per header name, but the browser
    // needs one string per header, so each header is flattened to its header line.
    $requestHeaders = [];

    foreach (array_keys($request->getHeaders()) as $name) {
        $requestHeaders[$name] = $request->getHeaderLine((string) $name);
    }

    $options['headers'] = array_merge($options['headers'] ?? [], $requestHeaders);

    $this->headlessBrowser = (new BrowserFactory())->createBrowser($options);

    $this->headlessBrowserOptionsDirty = false;

    return $this->headlessBrowser;
}

private function addCookiesToJar(RespondedRequest $aggregate): void
{
if ($this->useCookies) {
Expand All @@ -236,4 +405,17 @@ private function addCookiesToRequest(RequestInterface $request): RequestInterfac

return $request;
}

/**
 * Reduce every response header reported by the browser to a single-line value.
 *
 * A header value can consist of multiple lines. Only the first line is kept,
 * further lines are dropped. The split is done on "\n" (plus a trailing "\r"
 * being removed) instead of PHP_EOL, so the result doesn't depend on the
 * platform PHP runs on.
 *
 * @param string[] $headers header values keyed by header name, as reported by the browser
 * @return string[] the same headers, each value cut down to its first line
 */
private function sanitizeResponseHeaders(array $headers): array
{
    foreach ($headers as $key => $value) {
        $firstLine = explode("\n", $value, 2)[0];

        $headers[$key] = rtrim($firstLine, "\r");
    }

    return $headers;
}
}
1 change: 1 addition & 0 deletions src/Loader/Loader.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
abstract class Loader implements LoaderInterface
{
protected LoggerInterface $logger;

protected ?CacheInterface $cache = null;

/**
Expand Down
Loading

0 comments on commit 9cf42d7

Please sign in to comment.