Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: status-codes (fixes #161) #164

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions demo.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?php
spekulatius marked this conversation as resolved.
Show resolved Hide resolved

require __DIR__ . '/vendor/autoload.php';

// ENTER YOUR URL HERE:
$url = 'http://github.com/spekulatius/PHPScraper';
echo 'requesting ', $url, "\n";

$web = new \Spekulatius\PHPScraper\PHPScraper();
$web->go($url);
// Debugging aid: var_dump($web->client);

// Report where the request ended up and what the server answered.
if ($web->currentUrl !== $url) {
    echo 'redirected to ', $web->currentUrl, "\n";
}
echo 'status code ', $web->statusCode, "\n";

if ($web->isGone) {
    echo "delete/deactivate record from database\n";
} else {
    if ($web->permanentRedirectUrl !== '') {
        echo 'url changed - update url in database to ', $web->permanentRedirectUrl, "\n";
    }

    // Start from the retry time reported by the scraper; a falsy value means "none given",
    // in which case the branches below pick a default delay.
    $retryAt = $web->retryAt;

    if ($web->isSuccess) {
        echo "got data successfully - process it now...\n";
    } elseif ($web->isTemporaryResult) {
        echo "temporary error\n";
        // FIXME: use longer times if we get the same status code multiple times
        $retryAt = $retryAt ?: time() + 15 * 60;
    } else {
        echo "might be a permanent error - but who knows if the server changes its mind (e.g. if the result is caused by some administrative work on the server) --> try several times before considering it final\n";
        // FIXME: use longer times if we get the same status code multiple times
        // OR consider it somewhen really permanent and delete/deactivate record from database
        $retryAt = $retryAt ?: time() + 24 * 60 * 60;
    }

    if ($retryAt) {
        echo 'retry at ', date('Y-m-d H:i:s', $retryAt), "\n";
    }
}
102 changes: 102 additions & 0 deletions src/GoutteClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
<?php

namespace Spekulatius\PHPScraper;

use Goutte\Client;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Extended Goutte\Client with PHPScraper specific methods.
 *
 * Records, while requests are processed, whether redirects were permanent or
 * temporary and which `Retry-After` hints the server sent, so callers can
 * decide when (and where) to retry.
 *
 * NOTE(review): none of the recorded state is reset in this class, so it
 * accumulates across requests made with the same client instance — confirm
 * this is intended.
 */
class GoutteClient extends Client
{
    /**
     * Was a temporary redirect involved in loading this request?
     *
     * @var bool
     */
    public $usesTemporaryRedirect = false;

    /**
     * Should subsequent requests go to a different URL?
     *
     * Stays null until a permanent redirect has been followed.
     *
     * @var string|null
     */
    public $permanentRedirectUrl = null;

    /**
     * Which is the earliest moment to retry the request because of an outdated redirect? (unix timestamp)
     *
     * PHP_INT_MAX means that no redirect response carried a usable `Retry-After` header.
     *
     * @var int
     */
    protected $retryRedirectAt = PHP_INT_MAX;

    /**
     * Which is the earliest moment to retry the request because of a failed request? (unix timestamp)
     *
     * 0 means that no error response carried a usable `Retry-After` header.
     *
     * @var int
     */
    protected $retryFailureAt = 0;

    /**
     * Remember the permanent redirect url and detect if the redirect chain contains temporary redirects.
     *
     * Status 200 (META REFRESH), 301 and 308 are treated as permanent; every
     * other status reaching this method marks the chain as temporary.
     *
     * @return Crawler
     */
    public function followRedirect(): Crawler
    {
        $status = $this->internalResponse->getStatusCode();
        $isPermanent = $status === 200 /* META REFRESH */
            || $status === 301 /* Moved Permanently */
            || $status === 308 /* Permanent Redirect */;

        if ($isPermanent) {
            // Only remember the target when no earlier hop was temporary and the
            // server did not announce (via Retry-After) that the redirect may change.
            if (!$this->usesTemporaryRedirect && empty($this->internalResponse->getHeader('Retry-After'))) {
                $this->permanentRedirectUrl = $this->redirect;
            }
        } else {
            // $status === 300 /* Multiple Choices */ || $status === 302 /* Found */
            // || $status === 303 /* See Other */ || $status === 307 /* Temporary Redirect */
            $this->usesTemporaryRedirect = true;
        }

        // 300 Multiple Choices might also be handled as permanent redirect
        // META REFRESH might also be handled as temporary redirect if the delay is > 1s
        return parent::followRedirect();
    }

    /**
     * Evaluate the Retry-After header
     *
     * see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After
     *
     * For error responses (status >= 400) the latest announced moment is kept;
     * for redirect responses (status 300-399) the earliest one is kept.
     *
     * @param object $response The response being filtered; must offer getHeader() and getStatusCode().
     *
     * @return Response
     */
    protected function filterResponse(object $response)
    {
        $retryAfterHeaders = $response->getHeader('Retry-After', false);
        if (!empty($retryAfterHeaders)) {
            // Read the status from the response being filtered. In Symfony BrowserKit
            // $this->internalResponse is assigned from this method's return value, so
            // it still refers to the previous request (or is unset on the first one).
            $status = $response->getStatusCode();

            foreach ($retryAfterHeaders as $retryAfter) {
                $retryAt = $this->parseRetryAfter((string) $retryAfter);
                if ($retryAt === null) {
                    // Unparsable header value: ignore it instead of storing `false`.
                    continue;
                }

                if ($status >= 400) { // usually 429 Too Many Request or 503 Service Unavailable
                    if ($this->retryFailureAt < $retryAt) {
                        $this->retryFailureAt = $retryAt;
                    }
                } elseif ($status >= 300) {
                    if ($this->retryRedirectAt > $retryAt) {
                        $this->retryRedirectAt = $retryAt;
                    }
                }
            }
        }

        return parent::filterResponse($response);
    }

    /**
     * Calculate the earliest moment to retry the request
     *
     * A failure-related retry time takes precedence over a redirect-related one.
     *
     * @return int Unix timestamp, or 0 when no retry time is known.
     */
    public function retryAt(): int
    {
        if ($this->retryFailureAt) {
            return $this->retryFailureAt;
        }

        if ($this->retryRedirectAt < PHP_INT_MAX) {
            return $this->retryRedirectAt;
        }

        return 0;
    }

    /**
     * Convert a single Retry-After header value into a unix timestamp.
     *
     * @param string $retryAfter Either a delay in seconds or a date string.
     *
     * @return int|null The timestamp, or null when the value cannot be parsed.
     */
    private function parseRetryAfter(string $retryAfter): ?int
    {
        if (is_numeric($retryAfter)) {
            // Delay in seconds, relative to now.
            return time() + (int) $retryAfter;
        }

        $timestamp = strtotime($retryAfter);

        return $timestamp === false ? null : $timestamp;
    }
}
1 change: 0 additions & 1 deletion src/PHPScraper.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
* Most calls are passed through to the Core class.
*/

use Goutte\Client as GoutteClient;
use Symfony\Component\HttpClient\HttpClient as SymfonyHttpClient;

class PHPScraper
Expand Down
70 changes: 69 additions & 1 deletion src/UsesGoutte.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

namespace Spekulatius\PHPScraper;

use Goutte\Client as GoutteClient;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Contracts\HttpClient\HttpClientInterface;

Expand Down Expand Up @@ -133,4 +132,73 @@ public function clickLink($titleOrUrl): self

return $this;
}

/**
 * Report the client's temporary-redirect flag.
 *
 * @return bool The client's `usesTemporaryRedirect` value, or false when no client is set.
 */
public function usesTemporaryRedirect(): bool
{
    if (!$this->client) {
        return false;
    }

    return $this->client->usesTemporaryRedirect;
}

/**
 * Is the current result considered temporary?
 *
 * True when a temporary redirect was flagged, or when the status code is one
 * of the codes listed below.
 *
 * @return bool
 *
 * @throws \Exception From statusCode() when no page has been loaded yet and no temporary redirect is flagged.
 */
public function isTemporaryResult(): bool
{
    // Checked first so that statusCode() is only consulted when needed.
    if ($this->usesTemporaryRedirect()) {
        return true;
    }

    $temporaryStatusCodes = [
        408, // Request Timeout
        409, // Conflict
        419, // Page Expired
        420, // Enhance Your Calm
        421, // Misdirected Request
        423, // Locked
        425, // Too Early
        429, // Too Many Requests
        500, // Internal Server Error
        502, // Bad Gateway
        503, // Service Unavailable
        504, // Gateway Timeout
        507, // Insufficient Storage
        520, // Web Server returned an unknown error
        521, // Web server is down
        522, // Connection Timed Out
        523, // Origin is unreachable
        524, // A timeout occurred
        525, // SSL Handshake Failed
        527, // Railgun Error
        529, // Site is overloaded
        598, // Network read timeout error
        599, // Network Connect Timeout Error
    ];

    // Strict comparison: statusCode() returns an int and the list holds ints only.
    return \in_array($this->statusCode(), $temporaryStatusCodes, true);
}

/**
 * Did the server answer with 410 Gone, without the result being temporary?
 *
 * @return bool
 */
public function isGone(): bool
{
    if ($this->isTemporaryResult()) {
        return false;
    }

    return $this->statusCode() === 410; // Gone
}

/**
 * Is the result an error (status >= 400) that is not classified as temporary?
 *
 * @return bool
 */
public function isPermanentError(): bool
{
    if ($this->statusCode() < 400) {
        return false;
    }

    return !$this->isTemporaryResult();
}

/**
 * URL recorded by the client as the permanent redirect target.
 *
 * @return string The recorded URL, or an empty string when there is no client or no URL was recorded.
 */
public function permanentRedirectUrl(): string
{
    if (!$this->client) {
        return '';
    }

    return $this->client->permanentRedirectUrl ?? '';
}

/**
 * Earliest moment (unix timestamp) at which the request should be retried.
 *
 * A retry time reported by the client wins. Otherwise a 509 response yields
 * noon UTC on the first day of the next month; any other status yields 0.
 *
 * @return int Unix timestamp, or 0 when no retry time is known.
 *
 * @throws \Exception From statusCode() when no page has been loaded yet.
 */
public function retryAt(): int
{
    $retryAt = $this->client ? $this->client->retryAt() : 0;
    if ($retryAt) {
        return $retryAt;
    }

    if ($this->statusCode() === 509 /* Bandwidth Limit Exceeded */) {
        // Give providers in each timezone the chance to reset the traffic quota for the month.
        // "first day of" anchors the result to the month boundary; a bare "next month" keeps
        // the current day-of-month and can overflow (e.g. Jan 31 -> early March).
        $quotaReset = strtotime('first day of next month 12:00 UTC');

        return $quotaReset === false ? 0 : $quotaReset;
    }

    return 0;
}

/**
 * HTTP status code of the client's current response.
 *
 * @return int
 *
 * @throws \Exception When no page has been loaded yet (`currentPage` is null).
 */
public function statusCode(): int
{
    if ($this->currentPage !== null) {
        return $this->client->getResponse()->getStatusCode();
    }

    throw new \Exception('You can not access the status code before your first navigation using `go`.');
}

/**
 * Is the status code in the 200-299 range?
 *
 * @return bool
 */
public function isSuccess(): bool
{
    $code = $this->statusCode();

    return 200 <= $code && $code <= 299;
}

/**
 * Is the status code in the 400-499 range?
 *
 * @return bool
 */
public function isClientError(): bool
{
    $code = $this->statusCode();

    return 400 <= $code && $code <= 499;
}

/**
 * Is the status code in the 500-599 range?
 *
 * @return bool
 */
public function isServerError(): bool
{
    $code = $this->statusCode();

    return 500 <= $code && $code <= 599;
}

/**
 * Is the status code exactly 403?
 *
 * @return bool
 */
public function isForbidden(): bool
{
    $code = $this->statusCode();

    return $code === 403;
}

/**
 * Is the status code exactly 404?
 *
 * @return bool
 */
public function isNotFound(): bool
{
    $code = $this->statusCode();

    return $code === 404;
}
}
22 changes: 0 additions & 22 deletions tests/NotFoundTest.php

This file was deleted.

65 changes: 65 additions & 0 deletions tests/StatusCodeTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<?php

namespace Spekulatius\PHPScraper\Tests;

class StatusCodeTest extends \PHPUnit\Framework\TestCase
{
    /**
     * Reading the status code before any navigation must raise an exception.
     *
     * @test
     */
    public function testAccessErrorBeforeNavigation()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();

        $this->expectException(\Exception::class);
        $this->expectExceptionMessage('You can not access the status code before your first navigation using `go`.');

        // The property read alone is expected to throw.
        $scraper->statusCode;
    }

    /**
     * A reachable page reports 200 and only the success helper is true.
     *
     * @test
     */
    public function testOk()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();

        // Load the project home page.
        $scraper->go('https://phpscraper.de');

        // The raw status code.
        self::assertSame(200, $scraper->statusCode);

        // The status-class helpers.
        self::assertTrue($scraper->isSuccess);
        self::assertFalse($scraper->isClientError);
        self::assertFalse($scraper->isServerError);

        // The helpers for specific codes.
        self::assertFalse($scraper->isForbidden);
        self::assertFalse($scraper->isNotFound);
    }

    /**
     * A missing page reports 404 and is classified as a client error.
     *
     * @test
     */
    public function testNotFound()
    {
        $scraper = new \Spekulatius\PHPScraper\PHPScraper();

        // Load a test page which doesn't exist.
        $scraper->go('https://test-pages.phpscraper.de/page-does-not-exist.html');

        // The raw status code.
        self::assertSame(404, $scraper->statusCode);

        // The status-class helpers.
        self::assertFalse($scraper->isSuccess);
        self::assertTrue($scraper->isClientError);
        self::assertFalse($scraper->isServerError);

        // The helpers for specific codes.
        self::assertFalse($scraper->isForbidden);
        self::assertTrue($scraper->isNotFound);
    }
}