diff --git a/CHANGELOG.md b/CHANGELOG.md index f935566a..bc3b4b7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +* You can now call the new `useHeadlessBrowser()` method on the `HttpLoader` class to load pages with a headless Chrome browser. This is enough to get the HTML after JavaScript has been executed in the browser. For more sophisticated tasks, it is better to create a separate Loader and/or dedicated Steps. * With the `maxOutputs()` method of the abstract `Step` class you can now limit how many outputs a certain step should yield at max. That's for example helpful during development, when you want to run the crawler only with a small subset of the data/requests it will actually have to process when you eventually remove the limits. When a step has reached its limit, it won't even call the `invoke()` method any longer until the step is reset after a run. * With the new `outputHook()` method of the abstract `Crawler` class you can set a closure that'll receive all the outputs from all the steps. Should be only for debugging reasons. * The `extract()` method of the `Html` and `Xml` (children of `Dom`) steps now also works with a single selector instead of an array with a mapping. Sometimes you'll want to just get a simple string output e.g. for a next step, instead of an array with mapped extracted data. 
diff --git a/composer.json b/composer.json index 5dcc35af..f8f057e1 100644 --- a/composer.json +++ b/composer.json @@ -37,7 +37,8 @@ "symfony/css-selector": "^6.0", "psr/simple-cache": "^1.0|^2.0|^3.0", "guzzlehttp/guzzle": "^7.4", - "adbario/php-dot-notation": "^3.1" + "adbario/php-dot-notation": "^3.1", + "chrome-php/chrome": "^1.6" }, "require-dev": { "pestphp/pest": "^1.21", diff --git a/src/Loader/Http/HttpLoader.php b/src/Loader/Http/HttpLoader.php index f041c81b..abecfe50 100644 --- a/src/Loader/Http/HttpLoader.php +++ b/src/Loader/Http/HttpLoader.php @@ -13,6 +13,13 @@ use Exception; use GuzzleHttp\Client; use GuzzleHttp\Psr7\Request; +use GuzzleHttp\Psr7\Response; +use HeadlessChromium\Browser; +use HeadlessChromium\BrowserFactory; +use HeadlessChromium\Exception\CommunicationException; +use HeadlessChromium\Exception\NavigationExpired; +use HeadlessChromium\Exception\NoResponseAvailable; +use HeadlessChromium\Exception\OperationTimedOut; use InvalidArgumentException; use Psr\Http\Client\ClientExceptionInterface; use Psr\Http\Client\ClientInterface; @@ -24,9 +31,24 @@ class HttpLoader extends Loader { protected ClientInterface $httpClient; + protected CookieJar $cookieJar; + protected bool $useCookies = true; + protected bool $useHeadlessBrowser = false; + + /** + * @var mixed[] + */ + protected array $headlessBrowserOptions = [ + 'windowSize' => [1920, 1000], + ]; + + protected bool $headlessBrowserOptionsDirty = false; + + protected ?Browser $headlessBrowser = null; + public function __construct( UserAgentInterface $userAgent, ?ClientInterface $httpClient = null, @@ -65,14 +87,16 @@ public function load(mixed $subject): ?RespondedRequest } $request = $this->prepareRequest($request); + $this->callHook('beforeLoad', $request); try { $respondedRequest = $this->getFromCache($request); + $isFromCache = $respondedRequest !== null; if (!$respondedRequest) { - $respondedRequest = $this->handleRedirects($request); + $respondedRequest = 
$this->loadViaClientOrHeadlessBrowser($request); } if ($respondedRequest->response->getStatusCode() < 400) { @@ -83,12 +107,14 @@ public function load(mixed $subject): ?RespondedRequest if (!$isFromCache && $this->cache) { $responseCacheItem = HttpResponseCacheItem::fromAggregate($respondedRequest); + $this->cache->set($responseCacheItem->key(), $responseCacheItem); } return $respondedRequest; } catch (Throwable $exception) { $this->trackRequestEnd(); // Don't move to finally so hooks don't run before it. + $this->callHook('onError', $request, $exception); return null; @@ -99,20 +125,33 @@ public function load(mixed $subject): ?RespondedRequest /** * @throws ClientExceptionInterface + * @throws CommunicationException + * @throws CommunicationException\CannotReadResponse + * @throws CommunicationException\InvalidResponse + * @throws CommunicationException\ResponseHasError * @throws LoadingException + * @throws NavigationExpired + * @throws NoResponseAvailable + * @throws OperationTimedOut + * @throws Throwable * @throws \Psr\SimpleCache\InvalidArgumentException */ public function loadOrFail(mixed $subject): RespondedRequest { $request = $this->validateSubjectType($subject); + $this->isAllowedToBeLoaded($request->getUri(), true); + $request = $this->prepareRequest($request); + $this->callHook('beforeLoad', $request); + $respondedRequest = $this->getFromCache($request); + $isFromCache = $respondedRequest !== null; if (!$respondedRequest) { - $respondedRequest = $this->handleRedirects($request); + $respondedRequest = $this->loadViaClientOrHeadlessBrowser($request); } if ($respondedRequest->response->getStatusCode() >= 400) { @@ -120,10 +159,12 @@ public function loadOrFail(mixed $subject): RespondedRequest } $this->callHook('onSuccess', $request, $respondedRequest->response); + $this->callHook('afterLoad', $request); if (!$isFromCache && $this->cache) { $responseCacheItem = HttpResponseCacheItem::fromAggregate($respondedRequest); + 
$this->cache->set($responseCacheItem->key(), $responseCacheItem); } @@ -142,6 +183,48 @@ public function flushCookies(): void $this->cookieJar->flush(); } + public function useHeadlessBrowser(): static + { + $this->useHeadlessBrowser = true; + + return $this; + } + + public function useHttpClient(): static + { + $this->useHeadlessBrowser = false; + // Shut down a running browser instead of only dropping the reference to it. + $this->headlessBrowser?->close(); + $this->headlessBrowser = null; + return $this; + } + + /** + * @param mixed[] $options + */ + public function setHeadlessBrowserOptions(array $options): static + { + $this->headlessBrowserOptions = $options; + + $this->headlessBrowserOptionsDirty = true; + + return $this; + } + + /** + * @param mixed[] $options + */ + public function addHeadlessBrowserOptions(array $options): static + { + foreach ($options as $key => $value) { + $this->headlessBrowserOptions[$key] = $value; + } + + $this->headlessBrowserOptionsDirty = true; + + return $this; + } + /** * @throws \Psr\SimpleCache\InvalidArgumentException */ @@ -155,6 +238,7 @@ protected function getFromCache(RequestInterface $request): ?RespondedRequest if ($this->cache->has($key)) { $this->logger->info('Found ' . $request->getUri()->__toString() . 
' in cache.'); + $responseCacheItem = $this->cache->get($key); return $responseCacheItem->aggregate(); @@ -179,9 +263,30 @@ protected function validateSubjectType(RequestInterface|string $requestOrUri): R protected function prepareRequest(RequestInterface $request): RequestInterface { $request = $request->withHeader('User-Agent', $this->userAgent->__toString()); - $request = $this->addCookiesToRequest($request); - return $request; + return $this->addCookiesToRequest($request); + } + + /** + * @param RequestInterface $request + * @return RespondedRequest + * @throws ClientExceptionInterface + * @throws CommunicationException + * @throws CommunicationException\CannotReadResponse + * @throws CommunicationException\InvalidResponse + * @throws CommunicationException\ResponseHasError + * @throws NavigationExpired + * @throws NoResponseAvailable + * @throws OperationTimedOut + * @throws Throwable + */ + private function loadViaClientOrHeadlessBrowser(RequestInterface $request): RespondedRequest + { + if ($this->useHeadlessBrowser) { + return $this->loadViaHeadlessBrowser($request); + } + + return $this->handleRedirects($request); } /** @@ -192,7 +297,9 @@ private function handleRedirects( ?RespondedRequest $aggregate = null ): RespondedRequest { $this->trackRequestStart(); + $response = $this->httpClient->sendRequest($request); + $this->trackRequestEnd(); if (!$aggregate) { @@ -205,6 +312,7 @@ private function handleRedirects( if ($aggregate->isRedirect()) { $this->logger()->info('Load redirect to: ' . 
$aggregate->effectiveUri()); + $newRequest = $request->withUri(Url::parsePsr7($aggregate->effectiveUri())); return $this->handleRedirects($newRequest, $aggregate); @@ -213,6 +321,67 @@ private function handleRedirects( return $aggregate; } + /** + * @param RequestInterface $request request whose URI the browser navigates to + * @return RespondedRequest the request paired with a response built from the received status, headers and page HTML + * @throws CommunicationException + * @throws CommunicationException\CannotReadResponse + * @throws CommunicationException\InvalidResponse + * @throws CommunicationException\ResponseHasError + * @throws NavigationExpired + * @throws NoResponseAvailable + * @throws OperationTimedOut + * @throws Throwable + */ + private function loadViaHeadlessBrowser(RequestInterface $request): RespondedRequest + { + $browser = $this->getBrowser($request); + $page = $browser->createPage(); + $statusCode = 500; + $responseHeaders = []; + + try { + $page->getSession()->once( + "method:Network.responseReceived", + function ($params) use (& $statusCode, & $responseHeaders) { + $statusCode = $params['response']['status']; + $responseHeaders = $this->sanitizeResponseHeaders($params['response']['headers']); + } + ); + + $page->navigate($request->getUri()->__toString()) + ->waitForNavigation(); + $html = $page->getHtml(); + } finally { + // Close the page even when navigation fails, so loads don't pile up open pages in the reused browser. + $page->close(); + } + + return new RespondedRequest( + $request, + new Response($statusCode, $responseHeaders, $html) + ); + } + + private function getBrowser(RequestInterface $request): Browser + { + if (!$this->headlessBrowser || $this->headlessBrowserOptionsDirty) { + $this->headlessBrowser?->close(); + + $options = $this->headlessBrowserOptions; + + $options['userAgent'] = $this->userAgent->__toString(); + + $options['headers'] = array_merge($options['headers'] ?? 
[], $request->getHeaders()); + + $this->headlessBrowser = (new BrowserFactory())->createBrowser($options); + + $this->headlessBrowserOptionsDirty = false; + } + + return $this->headlessBrowser; + } + private function addCookiesToJar(RespondedRequest $aggregate): void { if ($this->useCookies) { @@ -236,4 +405,17 @@ private function addCookiesToRequest(RequestInterface $request): RequestInterfac return $request; } + + /** + * @param string[] $headers header values as passed in from the browser's response event + * @return string[] same keys, each value cut off at its first line feed ("\n", independent of the platform's PHP_EOL) + */ + private function sanitizeResponseHeaders(array $headers): array + { + foreach ($headers as $key => $value) { + $headers[$key] = explode("\n", $value)[0]; + } + + return $headers; + } } diff --git a/src/Loader/Loader.php b/src/Loader/Loader.php index 46ed65e3..d84d9b98 100644 --- a/src/Loader/Loader.php +++ b/src/Loader/Loader.php @@ -11,6 +11,7 @@ abstract class Loader implements LoaderInterface { protected LoggerInterface $logger; + protected ?CacheInterface $cache = null; /** diff --git a/tests/_Integration/Http/HeadlessBrowserTest.php b/tests/_Integration/Http/HeadlessBrowserTest.php new file mode 100644 index 00000000..66adb9d9 --- /dev/null +++ b/tests/_Integration/Http/HeadlessBrowserTest.php @@ -0,0 +1,114 @@ +useHeadlessBrowser(); + + return $loader; + } +} + +class GetJsonFromResponseHtmlBody extends Step +{ + protected function invoke(mixed $input): Generator + { + $html = Http::getBodyString($input->response); + + $jsonString = (new Crawler($html))->filter('body pre')->text(); + + yield json_decode($jsonString, true); + } +} + +class GetStringFromResponseHtmlBody extends Step +{ + protected function invoke(mixed $input): Generator + { + $html = Http::getBodyString($input->response); + + yield (new Crawler($html))->filter('body')->text(); + } +} + +it('automatically uses the Loader\'s user agent', function () { + $crawler = new HeadlessBrowserCrawler(); + + $crawler->input('http://localhost:8000/print-headers') + ->addStep(Http::get()) + ->addStep('responseBody', new 
GetJsonFromResponseHtmlBody()); + + $results = helper_generatorToArray($crawler->run()); + + expect($results)->toHaveCount(1); + + expect($results[0]->get('responseBody'))->toBeArray(); + + expect($results[0]->get('responseBody'))->toHaveKey('User-Agent'); + + expect($results[0]->get('responseBody')['User-Agent'])->toBe('Mozilla/5.0 (compatible; HeadlessBrowserBot)'); +}); + +it('uses cookies', function () { + $crawler = new HeadlessBrowserCrawler(); + + $crawler->input('http://localhost:8000/set-cookie') + ->addStep(Http::get()) + ->addStep(new class () extends Step { + protected function invoke(mixed $input): Generator + { + yield 'http://localhost:8000/print-cookie'; + } + }) + ->addStep(Http::get()) + ->addStep('printed-cookie', new GetStringFromResponseHtmlBody()); + + $results = helper_generatorToArray($crawler->run()); + + expect($results)->toHaveCount(1); + + expect($results[0]->get('printed-cookie'))->toBeString(); + + expect($results[0]->get('printed-cookie'))->toBe('foo123'); +}); + +it('renders javascript', function () { + $crawler = new HeadlessBrowserCrawler(); + + $crawler->input('http://localhost:8000/js-rendering') + ->addStep(Http::get()) + ->addStep( + Html::root() + ->extract(['content' => '#content p']) + ); + + $results = helper_generatorToArray($crawler->run()); + + expect($results)->toHaveCount(1); + + expect($results[0]->toArray())->toBe([ + 'content' => 'This was added through javascript', + ]); +}); diff --git a/tests/_Integration/Server.php b/tests/_Integration/Server.php index 26cb9a15..af241ae6 100644 --- a/tests/_Integration/Server.php +++ b/tests/_Integration/Server.php @@ -36,3 +36,19 @@ function getParamAfter(string $route, string $after): string if ($route === '/blog-post-with-json-ld') { return include(__DIR__ . '/_Server/BlogPostWithJsonLd.php'); } + +if ($route === '/js-rendering') { + return include(__DIR__ . '/_Server/JsGeneratedContent.php'); +} + +if ($route === '/print-headers') { + return include(__DIR__ . 
'/_Server/PrintHeaders.php'); +} + +if ($route === '/set-cookie') { + return include(__DIR__ . '/_Server/SetCookie.php'); +} + +if ($route === '/print-cookie') { + return include(__DIR__ . '/_Server/PrintCookie.php'); +} diff --git a/tests/_Integration/_Server/JsGeneratedContent.php b/tests/_Integration/_Server/JsGeneratedContent.php new file mode 100644 index 00000000..330f46f8 --- /dev/null +++ b/tests/_Integration/_Server/JsGeneratedContent.php @@ -0,0 +1,13 @@ + + + + JS Generated Content + + +
+
+ + + diff --git a/tests/_Integration/_Server/PrintCookie.php b/tests/_Integration/_Server/PrintCookie.php new file mode 100644 index 00000000..b96d01ca --- /dev/null +++ b/tests/_Integration/_Server/PrintCookie.php @@ -0,0 +1,3 @@ +