Skip to content

Commit

Permalink
Enable the use of Proxies
Browse files Browse the repository at this point in the history
Add new methods `HttpLoader::useProxy()` and
`HttpLoader::useRotatingProxies([...])` to define proxies that the
loader shall use. They can be used with a guzzle HTTP client instance
(default) and when the loader uses the headless chrome browser. Using
them when providing some other PSR-18 implementation will throw an
exception.
(see #99)

Also, fix `HttpLoader::load()` so that it no longer throws any
exception, because a loading error shouldn't kill a crawler run. When
you want any loading error to end the whole crawler execution, use
`HttpLoader::loadOrFail()` instead. Also adapted the phpdoc in
the `LoaderInterface`.
  • Loading branch information
otsch committed Sep 28, 2023
1 parent 63550d6 commit a250ad2
Show file tree
Hide file tree
Showing 9 changed files with 439 additions and 56 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
* New methods `HttpLoader::useProxy()` and `HttpLoader::useRotatingProxies([...])` to define proxies that the loader shall use. They can be used with a guzzle HTTP client instance (default) and when the loader uses the headless chrome browser. Using them when providing some other PSR-18 implementation will throw an exception.

### Fixed
* The `HttpLoader::load()` implementation no longer throws any exception, because a loading error shouldn't kill a crawler run. When you want any loading error to end the whole crawler execution, use `HttpLoader::loadOrFail()` instead. Also adapted the phpdoc in the `LoaderInterface`.

## [1.2.2] - 2023-09-19
### Fixed
Expand Down
13 changes: 12 additions & 1 deletion src/Loader/Http/Exceptions/LoadingException.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,16 @@
namespace Crwlr\Crawler\Loader\Http\Exceptions;

use Exception;
use Throwable;

class LoadingException extends Exception {}
/**
 * Exception type for failed loading attempts.
 */
class LoadingException extends Exception
{
    /**
     * Builds a LoadingException that wraps another throwable.
     *
     * The message contains the class name and the message of the wrapped throwable,
     * and the wrapped throwable is attached as the previous exception.
     */
    public static function from(Throwable $previousException): self
    {
        $type = $previousException::class;

        $message = 'Loading failed. Exception of type ' . $type . ' was thrown. Exception message: ';

        $message .= $previousException->getMessage();

        return new self($message, previous: $previousException);
    }
}
169 changes: 117 additions & 52 deletions src/Loader/Http/HttpLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
use Error;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use HeadlessChromium\Browser;
Expand Down Expand Up @@ -79,6 +80,8 @@ class HttpLoader extends Loader
*/
protected array $cacheUrlFilters = [];

protected ?ProxyManager $proxies = null;

/**
* @param mixed[] $defaultGuzzleClientConfig
*/
Expand Down Expand Up @@ -123,22 +126,26 @@ public function __construct(
/**
* @param mixed $subject
* @return RespondedRequest|null
* @throws LoadingException
* @throws Exception
*/
public function load(mixed $subject): ?RespondedRequest
{
$request = $this->validateSubjectType($subject);
try {
$request = $this->validateSubjectType($subject);
} catch (InvalidArgumentException) {
$this->logger->error('Invalid input URL: ' . var_export($subject, true));

if (!$this->isAllowedToBeLoaded($request->getUri())) {
return null;
}

$request = $this->prepareRequest($request);
try {
if (!$this->isAllowedToBeLoaded($request->getUri())) {
return null;
}

$this->callHook('beforeLoad', $request);
$request = $this->prepareRequest($request);

$this->callHook('beforeLoad', $request);

try {
$respondedRequest = $this->getFromCache($request);

$isFromCache = $respondedRequest !== null;
Expand Down Expand Up @@ -174,54 +181,45 @@ public function load(mixed $subject): ?RespondedRequest
}
}

/**
* @throws ClientExceptionInterface
* @throws CommunicationException
* @throws CommunicationException\CannotReadResponse
* @throws CommunicationException\InvalidResponse
* @throws CommunicationException\ResponseHasError
* @throws LoadingException
* @throws NavigationExpired
* @throws NoResponseAvailable
* @throws OperationTimedOut
* @throws Throwable
* @throws \Psr\SimpleCache\InvalidArgumentException
*/
public function loadOrFail(mixed $subject): RespondedRequest
{
$request = $this->validateSubjectType($subject);

$this->isAllowedToBeLoaded($request->getUri(), true);
try {
$this->isAllowedToBeLoaded($request->getUri(), true);

$request = $this->prepareRequest($request);
$request = $this->prepareRequest($request);

$this->callHook('beforeLoad', $request);
$this->callHook('beforeLoad', $request);

$respondedRequest = $this->getFromCache($request);
$respondedRequest = $this->getFromCache($request);

$isFromCache = $respondedRequest !== null;
$isFromCache = $respondedRequest !== null;

if ($isFromCache) {
$this->callHook('onCacheHit', $request, $respondedRequest->response);
}
if ($isFromCache) {
$this->callHook('onCacheHit', $request, $respondedRequest->response);
}

if (!$respondedRequest) {
$respondedRequest = $this->waitForGoAndLoadViaClientOrHeadlessBrowser($request);
}
if (!$respondedRequest) {
$respondedRequest = $this->waitForGoAndLoadViaClientOrHeadlessBrowser($request);
}

if ($respondedRequest->response->getStatusCode() >= 400) {
throw new LoadingException('Failed to load ' . $request->getUri()->__toString());
}
if ($respondedRequest->response->getStatusCode() >= 400) {
throw new LoadingException('Failed to load ' . $request->getUri()->__toString());
}

$this->callHook('onSuccess', $request, $respondedRequest->response);
$this->callHook('onSuccess', $request, $respondedRequest->response);

$this->callHook('afterLoad', $request);
$this->callHook('afterLoad', $request);

if (!$isFromCache) {
$this->addToCache($respondedRequest);
}
if (!$isFromCache) {
$this->addToCache($respondedRequest);
}

return $respondedRequest;
return $respondedRequest;
} catch (Throwable $exception) {
throw LoadingException::from($exception);
}
}

public function dontUseCookies(): static
Expand Down Expand Up @@ -328,6 +326,41 @@ public function cacheOnlyWhereUrl(FilterInterface $filter): static
return $this;
}

/**
 * Makes the loader use exactly one proxy.
 *
 * @param string $proxyUrl URL of the proxy to use.
 * @throws Exception when checkIfProxiesCanBeUsed() rejects the current loader setup.
 */
public function useProxy(string $proxyUrl): void
{
    // Must run first: it throws when proxies can't be used, and then no proxy must be set.
    $this->checkIfProxiesCanBeUsed();

    $singleProxyList = [$proxyUrl];

    $this->proxies = new ProxyManager($singleProxyList);
}

/**
 * Makes the loader use a list of proxies, managed by a ProxyManager instance.
 *
 * @param string[] $proxyUrls URLs of the proxies to use.
 * @throws Exception when checkIfProxiesCanBeUsed() rejects the current loader setup.
 */
public function useRotatingProxies(array $proxyUrls): void
{
    // Must run first: it throws when proxies can't be used, and then no proxies must be set.
    $this->checkIfProxiesCanBeUsed();

    $proxyManager = new ProxyManager($proxyUrls);

    $this->proxies = $proxyManager;
}

/**
 * Guards the proxy feature: it is only available when the loader uses the headless browser
 * or when the HTTP client is a guzzle Client instance.
 *
 * @throws Exception when neither of the two conditions is met.
 */
protected function checkIfProxiesCanBeUsed(): void
{
    if ($this->usesHeadlessBrowser() || $this->httpClient instanceof Client) {
        return;
    }

    throw new Exception(
        'The included proxy feature can only be used when using a guzzle HTTP client or headless chrome ' .
        'browser for loading.'
    );
}

/**
* @param mixed[] $config
* @return mixed[]
Expand Down Expand Up @@ -424,12 +457,15 @@ protected function shouldResponseBeCached(RespondedRequest $respondedRequest): b

/**
* @throws InvalidArgumentException
* @throws InvalidUrlException
*/
protected function validateSubjectType(RequestInterface|string $requestOrUri): RequestInterface
{
if (is_string($requestOrUri)) {
return new Request('GET', Url::parsePsr7($requestOrUri));
try {
return new Request('GET', Url::parsePsr7($requestOrUri));
} catch (InvalidUrlException) {
throw new InvalidArgumentException('Invalid URL.');
}
}

return $requestOrUri;
Expand All @@ -454,7 +490,6 @@ protected function prepareRequest(RequestInterface $request): RequestInterface
}

/**
* @return RespondedRequest
* @throws ClientExceptionInterface
* @throws CommunicationException
* @throws CommunicationException\CannotReadResponse
Expand All @@ -464,7 +499,8 @@ protected function prepareRequest(RequestInterface $request): RequestInterface
* @throws NavigationExpired
* @throws NoResponseAvailable
* @throws OperationTimedOut
* @throws Throwable
* @throws GuzzleException
* @throws Exception
*/
protected function waitForGoAndLoadViaClientOrHeadlessBrowser(RequestInterface $request): RespondedRequest
{
Expand All @@ -487,17 +523,17 @@ protected function waitForGoAndLoadViaClientOrHeadlessBrowser(RequestInterface $
}

/**
* @param RequestInterface $request
* @return RespondedRequest
* @throws ClientExceptionInterface
* @throws CommunicationException
* @throws CommunicationException\CannotReadResponse
* @throws CommunicationException\InvalidResponse
* @throws CommunicationException\ResponseHasError
* @throws GuzzleException
* @throws LoadingException
* @throws NavigationExpired
* @throws NoResponseAvailable
* @throws OperationTimedOut
* @throws Throwable
* @throws Exception
*/
protected function loadViaClientOrHeadlessBrowser(RequestInterface $request): RespondedRequest
{
Expand All @@ -511,6 +547,7 @@ protected function loadViaClientOrHeadlessBrowser(RequestInterface $request): Re
/**
* @throws ClientExceptionInterface
* @throws LoadingException
* @throws GuzzleException
*/
protected function handleRedirects(
RequestInterface $request,
Expand All @@ -525,7 +562,11 @@ protected function handleRedirects(
$this->throttler->trackRequestStartFor($request->getUri());
}

$response = $this->httpClient->sendRequest($request);
if ($this->proxies && $this->httpClient instanceof Client) {
$response = $this->sendProxiedRequestUsingGuzzle($request, $this->httpClient);
} else {
$response = $this->httpClient->sendRequest($request);
}

if (!$respondedRequest) {
$respondedRequest = new RespondedRequest($request, $response);
Expand All @@ -551,16 +592,31 @@ protected function handleRedirects(
}

/**
 * Sends the request via the given guzzle client, routed through the proxy returned by the
 * proxy manager.
 *
 * handleRedirects() sends non-proxied requests with the PSR-18 method sendRequest(). Guzzle's
 * sendRequest() disables automatic redirect following and never throws for 4xx/5xx responses,
 * whereas request() uses the client's defaults for both. The two options are therefore set
 * explicitly here, so that a proxied request yields the same kind of response as a
 * non-proxied one.
 *
 * @param RequestInterface $request The request to send.
 * @param Client $client The guzzle client used to send it.
 * @return ResponseInterface The response as received, including redirect and error responses.
 * @throws GuzzleException
 */
protected function sendProxiedRequestUsingGuzzle(RequestInterface $request, Client $client): ResponseInterface
{
    return $client->request(
        $request->getMethod(),
        $request->getUri(),
        [
            'headers' => $request->getHeaders(),
            'proxy' => $this->proxies?->getProxy(),
            'version' => $request->getProtocolVersion(),
            'body' => $request->getBody(),
            // Same behavior as Client::sendRequest(): return 3xx/4xx/5xx responses as they are.
            'allow_redirects' => false,
            'http_errors' => false,
        ],
    );
}

/**
* @throws CommunicationException
* @throws CommunicationException\CannotReadResponse
* @throws CommunicationException\InvalidResponse
* @throws CommunicationException\ResponseHasError
* @throws NavigationExpired
* @throws NoResponseAvailable
* @throws OperationTimedOut
* @throws Throwable
* @throws Exception
*/
protected function loadViaHeadlessBrowser(RequestInterface $request): RespondedRequest
{
Expand Down Expand Up @@ -602,7 +658,7 @@ function ($params) use (& $statusCode, & $responseHeaders) {
*/
protected function getBrowser(RequestInterface $request): Browser
{
if (!$this->headlessBrowser || $this->headlessBrowserOptionsDirty) {
if (!$this->headlessBrowser || $this->shouldRenewHeadlessBrowserInstance()) {
$this->headlessBrowser?->close();

$options = $this->headlessBrowserOptions;
Expand All @@ -614,6 +670,10 @@ protected function getBrowser(RequestInterface $request): Browser
$this->prepareRequestHeadersForHeadlessBrowser($request->getHeaders()),
);

if (!empty($this->proxies)) {
$options['proxyServer'] = $this->proxies->getProxy();
}

$this->headlessBrowser = (new BrowserFactory($this->chromeExecutable))->createBrowser($options);

$this->headlessBrowserOptionsDirty = false;
Expand All @@ -622,6 +682,11 @@ protected function getBrowser(RequestInterface $request): Browser
return $this->headlessBrowser;
}

/**
 * Tells getBrowser() whether an existing headless browser instance has to be replaced.
 *
 * That is the case when the browser options are flagged as dirty, or when multiple proxies
 * are configured (getBrowser() passes the proxy as an option when creating the browser).
 */
protected function shouldRenewHeadlessBrowserInstance(): bool
{
    if ($this->headlessBrowserOptionsDirty) {
        return true;
    }

    return $this->proxies !== null && $this->proxies->hasMultipleProxies();
}

protected function addCookiesToJar(RespondedRequest $respondedRequest): void
{
if ($this->useCookies) {
Expand Down
46 changes: 46 additions & 0 deletions src/Loader/Http/ProxyManager.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?php

namespace Crwlr\Crawler\Loader\Http;

/**
 * Holds a list of proxy URLs and hands them out one per getProxy() call.
 *
 * With a single proxy, that proxy is returned every time. With multiple proxies, they are
 * returned in list order, starting over with the first one after the last one was used.
 */
class ProxyManager
{
    /**
     * Index of the proxy returned by the most recent getProxy() call when rotating.
     * Stays null until the first rotation step (and always, when there is only one proxy).
     */
    protected ?int $lastUsedProxy = null;

    /**
     * @param string[] $proxies Proxy URLs. Array keys are discarded.
     */
    public function __construct(protected array $proxies)
    {
        // Re-index, so the rotation can rely on sequential integer keys starting at 0.
        $this->proxies = array_values($this->proxies);
    }

    /**
     * Alias of hasOnlySingleProxy().
     */
    public function singleProxy(): bool
    {
        return $this->hasOnlySingleProxy();
    }

    /**
     * @return bool True when exactly one proxy is defined.
     */
    public function hasOnlySingleProxy(): bool
    {
        return count($this->proxies) === 1;
    }

    /**
     * @return bool True when more than one proxy is defined.
     */
    public function hasMultipleProxies(): bool
    {
        return count($this->proxies) > 1;
    }

    /**
     * Returns the proxy to use for the next request.
     *
     * @return string The only proxy, or the next one in the rotation.
     * @throws \LogicException when the manager was created with an empty list of proxies.
     */
    public function getProxy(): string
    {
        $proxyCount = count($this->proxies);

        // Without this check, an empty list runs into an undefined array key and a TypeError
        // for the string return type, which doesn't tell the user what is actually wrong.
        if ($proxyCount === 0) {
            throw new \LogicException('No proxies defined. At least one proxy URL must be provided.');
        }

        if ($proxyCount === 1) {
            return $this->proxies[0];
        }

        // Start at the first proxy, then advance by one and wrap around after the last one.
        $this->lastUsedProxy = $this->lastUsedProxy === null
            ? 0
            : ($this->lastUsedProxy + 1) % $proxyCount;

        return $this->proxies[$this->lastUsedProxy];
    }
}
Loading

0 comments on commit a250ad2

Please sign in to comment.