From ed818cd4f80b57dd1cb6e6d5c16e987f4d189f6a Mon Sep 17 00:00:00 2001 From: otsch Date: Thu, 28 Sep 2023 16:12:18 +0200 Subject: [PATCH] Enable the use of Proxies Add new methods `HttpLoader::useProxy()` and `HttpLoader::useRotatingProxies([...])` to define proxies that the loader shall use. They can be used with a guzzle HTTP client instance (default) and when the loader uses the headless chrome browser. Using them when providing some other PSR-18 implementation will throw an exception. (see https://github.com/crwlrsoft/crawler/issues/99) Also, fix the `HttpLoader::load()` implementation so that it won't throw any exception, because it shouldn't kill a crawler run. When you want any loading error to end the whole crawler execution `HttpLoader::loadOrFail()` should be used. Also adapted the phpdoc in the `LoaderInterface`. --- CHANGELOG.md | 5 + .../Http/Exceptions/LoadingException.php | 13 +- src/Loader/Http/HttpLoader.php | 169 ++++++++++----- src/Loader/Http/ProxyManager.php | 46 ++++ src/Loader/LoaderInterface.php | 2 - tests/Loader/Http/ProxyManagerTest.php | 43 ++++ .../Paginators/SimpleWebsitePaginatorTest.php | 2 +- tests/_Integration/Http/ProxyingTest.php | 202 ++++++++++++++++++ tests/_Integration/ProxyServer.php | 13 ++ 9 files changed, 439 insertions(+), 56 deletions(-) create mode 100644 src/Loader/Http/ProxyManager.php create mode 100644 tests/Loader/Http/ProxyManagerTest.php create mode 100644 tests/_Integration/Http/ProxyingTest.php create mode 100644 tests/_Integration/ProxyServer.php diff --git a/CHANGELOG.md b/CHANGELOG.md index bbeaa9e..f623654 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +* New methods `HttpLoader::useProxy()` and `HttpLoader::useRotatingProxies([...])` to define proxies that the loader shall use. 
They can be used with a guzzle HTTP client instance (default) and when the loader uses the headless chrome browser. Using them when providing some other PSR-18 implementation will throw an exception. + +### Fixed +* The `HttpLoader::load()` implementation won't throw any exception, because it shouldn't kill a crawler run. When you want any loading error to end the whole crawler execution `HttpLoader::loadOrFail()` should be used. Also adapted the phpdoc in the `LoaderInterface`. ## [1.2.2] - 2023-09-19 ### Fixed diff --git a/src/Loader/Http/Exceptions/LoadingException.php b/src/Loader/Http/Exceptions/LoadingException.php index 59a1933..3839310 100644 --- a/src/Loader/Http/Exceptions/LoadingException.php +++ b/src/Loader/Http/Exceptions/LoadingException.php @@ -3,5 +3,16 @@ namespace Crwlr\Crawler\Loader\Http\Exceptions; use Exception; +use Throwable; -class LoadingException extends Exception {} +class LoadingException extends Exception +{ + public static function from(Throwable $previousException): self + { + return new self( + 'Loading failed. Exception of type ' . get_class($previousException) . ' was thrown. Exception message: ' . 
+ $previousException->getMessage(), + previous: $previousException, + ); + } +} diff --git a/src/Loader/Http/HttpLoader.php b/src/Loader/Http/HttpLoader.php index 54e5a6e..4a2cdaf 100644 --- a/src/Loader/Http/HttpLoader.php +++ b/src/Loader/Http/HttpLoader.php @@ -16,6 +16,7 @@ use Error; use Exception; use GuzzleHttp\Client; +use GuzzleHttp\Exception\GuzzleException; use GuzzleHttp\Psr7\Request; use GuzzleHttp\Psr7\Response; use HeadlessChromium\Browser; @@ -79,6 +80,8 @@ class HttpLoader extends Loader */ protected array $cacheUrlFilters = []; + protected ?ProxyManager $proxies = null; + /** * @param mixed[] $defaultGuzzleClientConfig */ @@ -123,22 +126,26 @@ public function __construct( /** * @param mixed $subject * @return RespondedRequest|null - * @throws LoadingException - * @throws Exception */ public function load(mixed $subject): ?RespondedRequest { - $request = $this->validateSubjectType($subject); + try { + $request = $this->validateSubjectType($subject); + } catch (InvalidArgumentException) { + $this->logger->error('Invalid input URL: ' . 
var_export($subject, true)); - if (!$this->isAllowedToBeLoaded($request->getUri())) { return null; } - $request = $this->prepareRequest($request); + try { + if (!$this->isAllowedToBeLoaded($request->getUri())) { + return null; + } - $this->callHook('beforeLoad', $request); + $request = $this->prepareRequest($request); + + $this->callHook('beforeLoad', $request); - try { $respondedRequest = $this->getFromCache($request); $isFromCache = $respondedRequest !== null; @@ -174,54 +181,45 @@ public function load(mixed $subject): ?RespondedRequest } } - /** - * @throws ClientExceptionInterface - * @throws CommunicationException - * @throws CommunicationException\CannotReadResponse - * @throws CommunicationException\InvalidResponse - * @throws CommunicationException\ResponseHasError - * @throws LoadingException - * @throws NavigationExpired - * @throws NoResponseAvailable - * @throws OperationTimedOut - * @throws Throwable - * @throws \Psr\SimpleCache\InvalidArgumentException - */ public function loadOrFail(mixed $subject): RespondedRequest { $request = $this->validateSubjectType($subject); - $this->isAllowedToBeLoaded($request->getUri(), true); + try { + $this->isAllowedToBeLoaded($request->getUri(), true); - $request = $this->prepareRequest($request); + $request = $this->prepareRequest($request); - $this->callHook('beforeLoad', $request); + $this->callHook('beforeLoad', $request); - $respondedRequest = $this->getFromCache($request); + $respondedRequest = $this->getFromCache($request); - $isFromCache = $respondedRequest !== null; + $isFromCache = $respondedRequest !== null; - if ($isFromCache) { - $this->callHook('onCacheHit', $request, $respondedRequest->response); - } + if ($isFromCache) { + $this->callHook('onCacheHit', $request, $respondedRequest->response); + } - if (!$respondedRequest) { - $respondedRequest = $this->waitForGoAndLoadViaClientOrHeadlessBrowser($request); - } + if (!$respondedRequest) { + $respondedRequest = 
$this->waitForGoAndLoadViaClientOrHeadlessBrowser($request); + } - if ($respondedRequest->response->getStatusCode() >= 400) { - throw new LoadingException('Failed to load ' . $request->getUri()->__toString()); - } + if ($respondedRequest->response->getStatusCode() >= 400) { + throw new LoadingException('Failed to load ' . $request->getUri()->__toString()); + } - $this->callHook('onSuccess', $request, $respondedRequest->response); + $this->callHook('onSuccess', $request, $respondedRequest->response); - $this->callHook('afterLoad', $request); + $this->callHook('afterLoad', $request); - if (!$isFromCache) { - $this->addToCache($respondedRequest); - } + if (!$isFromCache) { + $this->addToCache($respondedRequest); + } - return $respondedRequest; + return $respondedRequest; + } catch (Throwable $exception) { + throw LoadingException::from($exception); + } } public function dontUseCookies(): static @@ -328,6 +326,41 @@ public function cacheOnlyWhereUrl(FilterInterface $filter): static return $this; } + /** + * @throws Exception + */ + public function useProxy(string $proxyUrl): void + { + $this->checkIfProxiesCanBeUsed(); + + $this->proxies = new ProxyManager([$proxyUrl]); + } + + /** + * @param string[] $proxyUrls + * @throws Exception + */ + public function useRotatingProxies(array $proxyUrls): void + { + $this->checkIfProxiesCanBeUsed(); + + $this->proxies = new ProxyManager($proxyUrls); + } + + /** + * @return void + * @throws Exception + */ + protected function checkIfProxiesCanBeUsed(): void + { + if (!$this->usesHeadlessBrowser() && !$this->httpClient instanceof Client) { + throw new Exception( + 'The included proxy feature can only be used when using a guzzle HTTP client or headless chrome ' . + 'browser for loading.' 
+ ); + } + } + /** * @param mixed[] $config * @return mixed[] @@ -424,12 +457,15 @@ protected function shouldResponseBeCached(RespondedRequest $respondedRequest): b /** * @throws InvalidArgumentException - * @throws InvalidUrlException */ protected function validateSubjectType(RequestInterface|string $requestOrUri): RequestInterface { if (is_string($requestOrUri)) { - return new Request('GET', Url::parsePsr7($requestOrUri)); + try { + return new Request('GET', Url::parsePsr7($requestOrUri)); + } catch (InvalidUrlException) { + throw new InvalidArgumentException('Invalid URL.'); + } } return $requestOrUri; @@ -454,7 +490,6 @@ protected function prepareRequest(RequestInterface $request): RequestInterface } /** - * @return RespondedRequest * @throws ClientExceptionInterface * @throws CommunicationException * @throws CommunicationException\CannotReadResponse @@ -464,7 +499,8 @@ protected function prepareRequest(RequestInterface $request): RequestInterface * @throws NavigationExpired * @throws NoResponseAvailable * @throws OperationTimedOut - * @throws Throwable + * @throws GuzzleException + * @throws Exception */ protected function waitForGoAndLoadViaClientOrHeadlessBrowser(RequestInterface $request): RespondedRequest { @@ -487,17 +523,17 @@ protected function waitForGoAndLoadViaClientOrHeadlessBrowser(RequestInterface $ } /** - * @param RequestInterface $request - * @return RespondedRequest * @throws ClientExceptionInterface * @throws CommunicationException * @throws CommunicationException\CannotReadResponse * @throws CommunicationException\InvalidResponse * @throws CommunicationException\ResponseHasError + * @throws GuzzleException + * @throws LoadingException * @throws NavigationExpired * @throws NoResponseAvailable * @throws OperationTimedOut - * @throws Throwable + * @throws Exception */ protected function loadViaClientOrHeadlessBrowser(RequestInterface $request): RespondedRequest { @@ -511,6 +547,7 @@ protected function 
loadViaClientOrHeadlessBrowser(RequestInterface $request): Re /** * @throws ClientExceptionInterface * @throws LoadingException + * @throws GuzzleException */ protected function handleRedirects( RequestInterface $request, @@ -525,7 +562,11 @@ protected function handleRedirects( $this->throttler->trackRequestStartFor($request->getUri()); } - $response = $this->httpClient->sendRequest($request); + if ($this->proxies && $this->httpClient instanceof Client) { + $response = $this->sendProxiedRequestUsingGuzzle($request, $this->httpClient); + } else { + $response = $this->httpClient->sendRequest($request); + } if (!$respondedRequest) { $respondedRequest = new RespondedRequest($request, $response); @@ -551,8 +592,23 @@ protected function handleRedirects( } /** - * @param RequestInterface $request - * @return RespondedRequest + * @throws GuzzleException + */ + protected function sendProxiedRequestUsingGuzzle(RequestInterface $request, Client $client): ResponseInterface + { + return $client->request( + $request->getMethod(), + $request->getUri(), + [ + 'headers' => $request->getHeaders(), + 'proxy' => $this->proxies?->getProxy(), + 'version' => $request->getProtocolVersion(), + 'body' => $request->getBody(), + ], + ); + } + + /** * @throws CommunicationException * @throws CommunicationException\CannotReadResponse * @throws CommunicationException\InvalidResponse @@ -560,7 +616,7 @@ protected function handleRedirects( * @throws NavigationExpired * @throws NoResponseAvailable * @throws OperationTimedOut - * @throws Throwable + * @throws Exception */ protected function loadViaHeadlessBrowser(RequestInterface $request): RespondedRequest { @@ -602,7 +658,7 @@ function ($params) use (& $statusCode, & $responseHeaders) { */ protected function getBrowser(RequestInterface $request): Browser { - if (!$this->headlessBrowser || $this->headlessBrowserOptionsDirty) { + if (!$this->headlessBrowser || $this->shouldRenewHeadlessBrowserInstance()) { $this->headlessBrowser?->close(); $options 
= $this->headlessBrowserOptions; @@ -614,6 +670,10 @@ protected function getBrowser(RequestInterface $request): Browser $this->prepareRequestHeadersForHeadlessBrowser($request->getHeaders()), ); + if (!empty($this->proxies)) { + $options['proxyServer'] = $this->proxies->getProxy(); + } + $this->headlessBrowser = (new BrowserFactory($this->chromeExecutable))->createBrowser($options); $this->headlessBrowserOptionsDirty = false; @@ -622,6 +682,11 @@ protected function getBrowser(RequestInterface $request): Browser return $this->headlessBrowser; } + protected function shouldRenewHeadlessBrowserInstance(): bool + { + return $this->headlessBrowserOptionsDirty || ($this->proxies && $this->proxies->hasMultipleProxies()); + } + protected function addCookiesToJar(RespondedRequest $respondedRequest): void { if ($this->useCookies) { diff --git a/src/Loader/Http/ProxyManager.php b/src/Loader/Http/ProxyManager.php new file mode 100644 index 0000000..c978391 --- /dev/null +++ b/src/Loader/Http/ProxyManager.php @@ -0,0 +1,46 @@ +proxies = array_values($this->proxies); + } + + public function singleProxy(): bool + { + return count($this->proxies) === 1; + } + + public function hasOnlySingleProxy(): bool + { + return count($this->proxies) === 1; + } + + public function hasMultipleProxies(): bool + { + return count($this->proxies) > 1; + } + + public function getProxy(): string + { + if ($this->hasOnlySingleProxy()) { + return $this->proxies[0]; + } + + if ($this->lastUsedProxy === null || !isset($this->proxies[$this->lastUsedProxy + 1])) { + $this->lastUsedProxy = 0; + } else { + $this->lastUsedProxy += 1; + } + + return $this->proxies[$this->lastUsedProxy]; + } +} diff --git a/src/Loader/LoaderInterface.php b/src/Loader/LoaderInterface.php index 45f3b96..b9d661b 100644 --- a/src/Loader/LoaderInterface.php +++ b/src/Loader/LoaderInterface.php @@ -11,8 +11,6 @@ interface LoaderInterface /** * @param mixed $subject The subject to load, whatever the Loader implementation needs to load 
something. * @return mixed - * @throws InvalidArgumentException Throw an InvalidArgumentException when the type of $subject argument isn't - * valid for the Loader implementation. */ public function load(mixed $subject): mixed; diff --git a/tests/Loader/Http/ProxyManagerTest.php b/tests/Loader/Http/ProxyManagerTest.php new file mode 100644 index 0000000..e24cc7c --- /dev/null +++ b/tests/Loader/Http/ProxyManagerTest.php @@ -0,0 +1,43 @@ +hasOnlySingleProxy()) + ->toBeTrue() + ->and($manager->hasMultipleProxies()) + ->toBeFalse(); + + $manager = new ProxyManager(['http://127.0.0.1:8001', 'http://127.0.0.1:8002']); + + expect($manager->hasOnlySingleProxy()) + ->toBeFalse() + ->and($manager->hasMultipleProxies()) + ->toBeTrue(); +}); + +it('returns the proxy when only one is defined', function () { + $manager = new ProxyManager(['http://127.0.0.1:8003']); + + expect($manager->getProxy()) + ->toBe('http://127.0.0.1:8003') + ->and($manager->getProxy()) + ->toBe('http://127.0.0.1:8003'); +}); + +it('rotates the proxies when multiple are defined', function () { + $manager = new ProxyManager(['http://127.0.0.1:8001', 'http://127.0.0.1:8002', 'http://127.0.0.1:8003']); + + expect($manager->getProxy()) + ->toBe('http://127.0.0.1:8001') + ->and($manager->getProxy()) + ->toBe('http://127.0.0.1:8002') + ->and($manager->getProxy()) + ->toBe('http://127.0.0.1:8003') + ->and($manager->getProxy()) + ->toBe('http://127.0.0.1:8001'); +}); diff --git a/tests/Steps/Loading/Http/Paginators/SimpleWebsitePaginatorTest.php b/tests/Steps/Loading/Http/Paginators/SimpleWebsitePaginatorTest.php index b6eeabe..4cf605a 100644 --- a/tests/Steps/Loading/Http/Paginators/SimpleWebsitePaginatorTest.php +++ b/tests/Steps/Loading/Http/Paginators/SimpleWebsitePaginatorTest.php @@ -27,7 +27,7 @@ function helper_createResponseBodyWithPaginationLinks(array $links): string $body = ''; diff --git a/tests/_Integration/Http/ProxyingTest.php b/tests/_Integration/Http/ProxyingTest.php new file mode 100644 index 
0000000..089bc83 --- /dev/null +++ b/tests/_Integration/Http/ProxyingTest.php @@ -0,0 +1,202 @@ + + */ + public static array $processes = [8001 => null, 8002 => null, 8003 => null]; +} + +beforeEach(function () { + $startedProcesses = false; + + foreach (ProxyServerProcesses::PORTS as $port) { + if (!ProxyServerProcesses::$processes[$port]) { + ProxyServerProcesses::$processes[$port] = Process::fromShellCommandline( + 'php -S localhost:' . $port . ' ' . __DIR__ . '/../ProxyServer.php' + ); + + ProxyServerProcesses::$processes[$port]->start(); + + $startedProcesses = true; + } + } + + if ($startedProcesses) { + usleep(100_000); + } +}); + +afterAll(function () { + foreach (ProxyServerProcesses::PORTS as $port) { + ProxyServerProcesses::$processes[$port]?->stop(3, SIGINT); + + ProxyServerProcesses::$processes[$port] = null; + } +}); + +it('uses a proxy when the useProxy() method of the loader was called', function () { + $crawler = helper_getFastCrawler(); + + $loader = $crawler->getLoader(); + + /** @var HttpLoader $loader */ + + $loader->useProxy('http://localhost:8001'); + + $crawler + ->input('http://www.crwlr.software/packages') + ->addStep(Http::get()->addToResult(['body'])); + + $results = iterator_to_array($crawler->run()); + + expect($results[0]) + ->toBeInstanceOf(Result::class) + ->and($results[0]->get('body')) + ->toContain('Proxy Server Response for http://www.crwlr.software/packages'); +}); + +it('uses correct method, headers and HTTP version in the proxied request', function () { + $crawler = helper_getFastCrawler(); + + $loader = $crawler->getLoader(); + + /** @var HttpLoader $loader */ + + $loader->useProxy('http://localhost:8001'); + + $crawler + ->input('http://www.crwlr.software/packages') + ->addStep( + Http::put(['Accept-Encoding' => 'gzip, deflate, br'], 'Hello World', '1.0') + ->addToResult(['body']) + ); + + $results = iterator_to_array($crawler->run()); + + expect($results[0]) + ->toBeInstanceOf(Result::class) + 
->and($results[0]->get('body')) + ->toContain('Protocol Version: HTTP/1.0') + ->toContain('Request Method: PUT') + ->toContain('Request Body: Hello World') + ->toContain('["Accept-Encoding"]=>' . PHP_EOL . ' string(17) "gzip, deflate, br"'); +}); + +it('uses rotating proxies when the useRotatingProxies() method of the loader was called', function () { + $crawler = helper_getFastCrawler(); + + $loader = $crawler->getLoader(); + + /** @var HttpLoader $loader */ + + $loader->useRotatingProxies([ + 'http://localhost:8001', + 'http://localhost:8002', + 'http://localhost:8003', + ]); + + $crawler + ->input([ + 'http://www.crwlr.software/packages/crawler/v1.1/getting-started', + 'http://www.crwlr.software/packages/url/v2.0/getting-started', + 'http://www.crwlr.software/packages/query-string/v1.0/getting-started', + 'http://www.crwlr.software/packages/robots-txt/v1.1/getting-started', + ]) + ->addStep(Http::get()->addToResult(['body'])); + + $results = iterator_to_array($crawler->run()); + + expect($results)->toHaveCount(4) + ->and($results[0]) + ->toBeInstanceOf(Result::class) + ->and($results[0]->get('body')) + ->toContain('Port: 8001') // First request with first proxy + ->and($results[1]) + ->toBeInstanceOf(Result::class) + ->and($results[1]->get('body')) + ->toContain('Port: 8002') // Second request with second proxy + ->and($results[2]) + ->toBeInstanceOf(Result::class) + ->and($results[2]->get('body')) + ->toContain('Port: 8003') // Third request with third proxy + ->and($results[3]) + ->toBeInstanceOf(Result::class) + ->and($results[3]->get('body')) + ->toContain('Port: 8001'); // And finally the fourth request with the first proxy again. 
+}); + +it('can also use a proxy when using the headless browser', function () { + $crawler = helper_getFastCrawler(); + + $loader = $crawler->getLoader(); + + /** @var HttpLoader $loader */ + + $loader + ->useHeadlessBrowser() + ->useProxy('http://localhost:8001'); + + $crawler + ->input('http://www.crwlr.software/blog') + ->addStep( + Http::get(['Accept-Language' => 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7']) + ->addToResult(['body']) + ); + + $results = iterator_to_array($crawler->run()); + + expect($results[0]) + ->toBeInstanceOf(Result::class) + ->and($results[0]->get('body')) + ->toContain('["Accept-Language"]=>' . PHP_EOL . ' string(35) "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"'); +}); + +it('can also use rotating proxies when using the headless browser', function () { + $crawler = helper_getFastCrawler(); + + $loader = $crawler->getLoader(); + + /** @var HttpLoader $loader */ + + $loader + ->useHeadlessBrowser() + ->useRotatingProxies([ + 'http://localhost:8001', + 'http://localhost:8002', + ]); + + $crawler + ->input([ + 'http://www.crwlr.software/packages/crawler/v1.1', + 'http://www.crwlr.software/packages/url/v2.0', + 'http://www.crwlr.software/packages/query-string/v1.0', + ]) + ->addStep(Http::get()->addToResult(['body'])); + + $results = iterator_to_array($crawler->run()); + + expect($results)->toHaveCount(3) + ->and($results[0]) + ->toBeInstanceOf(Result::class) + ->and($results[0]->get('body')) + ->toContain('Port: 8001') // First request with first proxy + ->and($results[1]) + ->toBeInstanceOf(Result::class) + ->and($results[1]->get('body')) + ->toContain('Port: 8002') // Second request with second proxy + ->and($results[2]) + ->toBeInstanceOf(Result::class) + ->and($results[2]->get('body')) + ->toContain('Port: 8001'); // And finally the third request with the first proxy again. 
+}); diff --git a/tests/_Integration/ProxyServer.php b/tests/_Integration/ProxyServer.php new file mode 100644 index 0000000..df642db --- /dev/null +++ b/tests/_Integration/ProxyServer.php @@ -0,0 +1,13 @@ +