diff --git a/examples/CheckStatus.php b/examples/CheckStatus.php
new file mode 100644
index 0000000..d4a5eba
--- /dev/null
+++ b/examples/CheckStatus.php
@@ -0,0 +1,40 @@
+go($url);
+
+if ($web->currentUrl !== $url) {
+    echo 'redirected to ', $web->currentUrl, "\n";
+}
+echo 'status code ', $web->statusCode, "\n";
+
+if ($web->isGone) {
+    echo "delete/deactivate record from database\n";
+} else {
+    if ($web->permanentRedirectUrl !== '') {
+        echo 'url changed - update url in database to ', $web->permanentRedirectUrl, "\n";
+    }
+
+    $retryAt = $web->retryAt;
+    if ($web->isSuccess) {
+        echo "got data successfully - process it now...\n";
+    } elseif ($web->isTemporaryResult) {
+        echo "temporary error\n";
+        if (!$retryAt) {
+            $retryAt = time() + 15*60;
+        } // FIXME: use longer times if we get the same status code multiple times
+    } else {
+        echo "might be a permanent error - but who knows if the server changes its mind (e.g. if the result is caused by some administrative work on the server) --> try several times before considering it final\n";
+        if (!$retryAt) {
+            $retryAt = time() + 24*60*60;
+        } // FIXME: use longer times if we get the same status code multiple times OR eventually treat it as really permanent and delete/deactivate record from database
+    }
+    if ($retryAt) {
+        echo 'retry at ', date('Y-m-d H:i:s', $retryAt), "\n";
+    }
+}
diff --git a/src/GoutteClient.php b/src/GoutteClient.php
new file mode 100644
index 0000000..2710a88
--- /dev/null
+++ b/src/GoutteClient.php
@@ -0,0 +1,168 @@
+isMainRequest) {
+            $this->usesTemporaryRedirect = false;
+            $this->permanentRedirectUrl = null;
+            $this->retryRedirectAt = PHP_INT_MAX;
+            $this->retryFailureAt = 0;
+        }
+        try {
+            return parent::request($method, $uri, $parameters, $files, $server, $content, $changeHistory);
+        } catch (TimeoutException $e) {
+            $content = $e->getMessage();
+            $status = 499; // Client Closed Request
+        } catch (TransportExceptionInterface $e) {
+            $content = $e->getMessage();
+            $status = 0; // Network Error
+        }
+        $this->response = new Response($content, $status, ['Content-Type' => 'text/plain', 'Content-Length' => strlen($content), 'Date' => gmdate('D, d M Y H:i:s T')]);
+        $this->internalResponse = $this->filterResponse($this->response);
+        $this->redirect = null;
+        $this->crawler = $this->createCrawlerFromContent($this->internalRequest->getUri(), $this->internalResponse->getContent(), $this->internalResponse->getHeader('Content-Type') ?? '');
+        return $this->crawler;
+    }
+
+    /**
+     * Remember permanent redirect url and detect if the redirect chain contains temporary redirects
+     *
+     * @return Crawler
+     */
+    public function followRedirect(): Crawler
+    {
+        $this->isMainRequest = false;
+        try {
+            $status = $this->internalResponse->getStatusCode();
+            if ($status === 200 /* META REFRESH */ || $status === 301 /* Moved Permanently */ || $status === 308 /* Permanent Redirect */) {
+                if (!$this->usesTemporaryRedirect && empty($this->internalResponse->getHeader('Retry-After'))) {
+                    $this->permanentRedirectUrl = $this->redirect;
+                }
+            } else { // $status === 300 /* Multiple Choices */ || $status === 302 /* Found */ || $status === 303 /* See Other */ || $status === 307 /* Temporary Redirect */
+                $this->usesTemporaryRedirect = true;
+            }
+            // 300 Multiple Choices might also be handled as permanent redirect
+            // META REFRESH might also be handled as temporary redirect if the delay is > 1s
+            return parent::followRedirect();
+        } finally {
+            // Restore the flag even when an exception escapes, otherwise the next
+            // request() would not reset the redirect/retry state
+            $this->isMainRequest = true;
+        }
+    }
+
+    /**
+     * Evaluate the Retry-After header
+     *
+     * see https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After
+     *
+     * @return Response
+     */
+    protected function filterResponse(object $response)
+    {
+        $retryAfterHeaders = $response->getHeader('Retry-After', false);
+        if (!empty($retryAfterHeaders)) {
+            $status = $response->getStatusCode();
+            foreach ($retryAfterHeaders as $retryAfter) {
+                if (is_numeric($retryAfter)) {
+                    $retryAt = time() + (int) $retryAfter; // delay in seconds; cast keeps the timestamp an int
+                } else {
+                    $retryAt = strtotime($retryAfter);
+                    if ($retryAt === false) {
+                        // Unparsable HTTP-date: skip it rather than storing `false` as a timestamp
+                        continue;
+                    }
+                }
+                if ($status >= 400) { // usually 429 Too Many Requests or 503 Service Unavailable
+                    if ($this->retryFailureAt < $retryAt) {
+                        $this->retryFailureAt = $retryAt;
+                    }
+                } elseif ($status >= 300) {
+                    if ($this->retryRedirectAt > $retryAt) {
+                        $this->retryRedirectAt = $retryAt;
+                    }
+                }
+            }
+        }
+
+        return parent::filterResponse($response);
+    }
+
+    /**
+     * Calculate the earliest moment to retry the request
+     *
+     * @return int Unix timestamp, or 0 when no Retry-After information was seen
+     */
+    public function retryAt(): int
+    {
+        if ($this->retryFailureAt) {
+            return $this->retryFailureAt;
+        }
+        if ($this->retryRedirectAt < PHP_INT_MAX) {
+            return $this->retryRedirectAt;
+        }
+
+        return 0;
+    }
+}
diff --git a/src/PHPScraper.php b/src/PHPScraper.php
index 8fe88be..f517a7c 100644
--- a/src/PHPScraper.php
+++ b/src/PHPScraper.php
@@ -8,7 +8,6 @@
  * Most calls are passed through to the Core class.
  */
 
-use Goutte\Client as GoutteClient;
 use Symfony\Component\HttpClient\HttpClient as SymfonyHttpClient;
 
 class PHPScraper
diff --git a/src/UsesGoutte.php b/src/UsesGoutte.php
index 5f9be19..2fc68a8 100644
--- a/src/UsesGoutte.php
+++ b/src/UsesGoutte.php
@@ -2,7 +2,6 @@
 
 namespace Spekulatius\PHPScraper;
 
-use Goutte\Client as GoutteClient;
 use Symfony\Component\DomCrawler\Crawler;
 use Symfony\Contracts\HttpClient\HttpClientInterface;
 
@@ -133,4 +132,111 @@ public function clickLink($titleOrUrl): self
 
         return $this;
     }
-}
\ No newline at end of file
+
+    /**
+     * Whether the last result is likely to change soon: a temporary redirect was involved
+     * or the status code typically signals a transient client/server/network condition.
+     */
+    public function isTemporaryResult(): bool
+    {
+        return $this->usesTemporaryRedirect() || \in_array($this->statusCode(), [
+            408, // Request Timeout
+            409, // Conflict
+            419, // Page Expired
+            420, // Enhance Your Calm
+            421, // Misdirected Request
+            423, // Locked
+            425, // Too Early
+            429, // Too Many Requests
+            499, // Client Closed Request (Timeout)
+            500, // Internal Server Error
+            502, // Bad Gateway
+            503, // Service Unavailable
+            504, // Gateway Timeout
+            507, // Insufficient Storage
+            520, // Web Server returned an unknown error
+            521, // Web Server is down
+            522, // Connection Timed Out
+            523, // Origin is unreachable
+            524, // A timeout occurred
+            525, // SSL Handshake Failed
+            527, // Railgun Error
+            529, // Site is overloaded
+            598, // Network read timeout error
+            599, // Network Connect Timeout Error
+        ], true);
+    }
+
+    /**
+     * Whether the resource was reported as 410 Gone without any temporary condition involved.
+     */
+    public function isGone(): bool
+    {
+        return !$this->isTemporaryResult() && $this->statusCode() === 410 /* Gone */;
+    }
+
+    /**
+     * Whether the request failed (status 0 or >= 400) and the failure is not classified as temporary.
+     */
+    public function isPermanentError(): bool
+    {
+        return (!$this->statusCode() || $this->statusCode() >= 400) && !$this->isTemporaryResult();
+    }
+
+    /**
+     * Whether the redirect chain of the last request contained a temporary redirect.
+     */
+    public function usesTemporaryRedirect(): bool
+    {
+        return $this->client ? $this->client->usesTemporaryRedirect : false;
+    }
+
+    /**
+     * Target URL remembered for a permanent redirect of the last request, or '' if there is none.
+     */
+    public function permanentRedirectUrl(): string
+    {
+        return $this->client ? ($this->client->permanentRedirectUrl ?? '') : '';
+    }
+
+    /**
+     * Unix timestamp at which the request should be retried, or 0 if no retry time is known.
+     *
+     * Prefers the client's Retry-After based value; for 509 Bandwidth Limit Exceeded it falls
+     * back to noon UTC on the first day of the next month.
+     */
+    public function retryAt(): int
+    {
+        $retryAt = $this->client ? ($this->client->retryAt()) : 0;
+        if ($retryAt) {
+            return $retryAt;
+        }
+        if ($this->statusCode() === 509 /* Bandwidth Limit Exceeded */) {
+            return strtotime('first day of next month 12:00 UTC');
+        } // noon UTC gives providers in each timezone the chance to reset the traffic quota for the month
+        return 0;
+    }
+
+    /**
+     * HTTP status code of the last response.
+     *
+     * @throws \Exception if no page has been fetched with `go` yet
+     */
+    public function statusCode(): int
+    {
+        if ($this->currentPage === null) {
+            throw new \Exception('You can not access the status code before your first navigation using `go`.');
+        }
+
+        return $this->client->getResponse()->getStatusCode();
+    }
+
+    /**
+     * Whether the last response has a 2xx status code.
+     */
+    public function isSuccess(): bool
+    {
+        return $this->statusCode() >= 200 && $this->statusCode() <= 299;
+    }
+
+}
diff --git a/tests/NotFoundTest.php b/tests/NotFoundTest.php
deleted file mode 100644
index 1bd1ce1..0000000
--- a/tests/NotFoundTest.php
+++ /dev/null
@@ -1,22 +0,0 @@
-go('https://test-pages.phpscraper.de/page-does-not-exist.html');
-
-        // The built-in server returns this string.
-        $this->assertSame('Page Not Found', $web->title);
-    }
-}
diff --git a/tests/StatusCodeTest.php b/tests/StatusCodeTest.php
new file mode 100644
index 0000000..e8c2771
--- /dev/null
+++ b/tests/StatusCodeTest.php
@@ -0,0 +1,223 @@
+expectException(\Exception::class);
+        $this->expectExceptionMessage('You can not access the status code before your first navigation using `go`.');
+
+        $web->statusCode;
+    }
+
+    /**
+     * @test
+     */
+    public function testOk()
+    {
+        $web = new \Spekulatius\PHPScraper\PHPScraper;
+
+        // Navigate to the test page without redirect
+        $web->go('https://phpscraper.de/');
+
+        // Check the status itself.
+        $this->assertSame(200, $web->statusCode);
+
+        // Check the detailed states.
+        $this->assertTrue($web->isSuccess);
+        $this->assertFalse($web->isTemporaryResult);
+        $this->assertFalse($web->isGone);
+        $this->assertFalse($web->isPermanentError);
+
+        // Check the request properties
+        $this->assertFalse($web->usesTemporaryRedirect);
+        $this->assertSame('', $web->permanentRedirectUrl);
+        $this->assertSame(0, $web->retryAt);
+    }
+
+    /**
+     * @test
+     */
+    public function testNotFound()
+    {
+        $web = new \Spekulatius\PHPScraper\PHPScraper;
+
+        // Navigate to the test page which doesn't exist.
+        $web->go('https://test-pages.phpscraper.de/page-does-not-exist.html');
+
+        // Check the status itself.
+        $this->assertSame(404, $web->statusCode);
+
+        // Check the detailed states.
+        $this->assertFalse($web->isSuccess);
+        $this->assertFalse($web->isTemporaryResult);
+        $this->assertFalse($web->isGone);
+        $this->assertTrue($web->isPermanentError);
+
+        // Check the request properties
+        $this->assertFalse($web->usesTemporaryRedirect);
+        $this->assertSame('', $web->permanentRedirectUrl);
+        $this->assertSame(0, $web->retryAt);
+    }
+
+    /**
+     * @test
+     */
+    public function testPermanentRedirect()
+    {
+        $web = new \Spekulatius\PHPScraper\PHPScraper;
+
+        // Navigate to the test page with 301 permanent redirect
+        $web->go('http://phpscraper.de/');
+
+        // Check the status itself.
+        $this->assertSame(200, $web->statusCode);
+
+        // Check the detailed states.
+        $this->assertTrue($web->isSuccess);
+        $this->assertFalse($web->isTemporaryResult);
+        $this->assertFalse($web->isGone);
+        $this->assertFalse($web->isPermanentError);
+
+        // Check the request properties
+        $this->assertFalse($web->usesTemporaryRedirect);
+        $this->assertSame('https://phpscraper.de/', $web->permanentRedirectUrl);
+        $this->assertSame(0, $web->retryAt);
+    }
+
+    /**
+     * @test
+     */
+    public function testTemporaryRedirect()
+    {
+        $web = new \Spekulatius\PHPScraper\PHPScraper;
+
+        // Navigate to the test page with 307 temporary redirect
+        $web->go('https://httpstat.us/307');
+
+        // Check the status itself.
+        $this->assertSame(200, $web->statusCode);
+
+        // Check the detailed states.
+        $this->assertTrue($web->isSuccess);
+        $this->assertTrue($web->isTemporaryResult);
+        $this->assertFalse($web->isGone);
+        $this->assertFalse($web->isPermanentError);
+
+        // Check the request properties
+        $this->assertTrue($web->usesTemporaryRedirect);
+        $this->assertSame('', $web->permanentRedirectUrl);
+        $this->assertSame(0, $web->retryAt);
+    }
+
+    /**
+     * @test
+     */
+    public function testGone()
+    {
+        $web = new \Spekulatius\PHPScraper\PHPScraper;
+
+        // Navigate to the test page
+        $web->go('https://httpstat.us/410');
+
+        // Check the status itself.
+        $this->assertSame(410, $web->statusCode);
+
+        // Check the detailed states.
+        $this->assertFalse($web->isSuccess);
+        $this->assertFalse($web->isTemporaryResult);
+        $this->assertTrue($web->isGone);
+        $this->assertTrue($web->isPermanentError);
+
+        // Check the request properties
+        $this->assertFalse($web->usesTemporaryRedirect);
+        $this->assertSame('', $web->permanentRedirectUrl);
+        $this->assertSame(0, $web->retryAt);
+    }
+
+    /**
+     * @test
+     */
+    public function testTooManyRequests()
+    {
+        $web = new \Spekulatius\PHPScraper\PHPScraper;
+
+        // Navigate to the test page which returns "429 Too Many Requests" with "Retry-After: 5" header
+        $t1 = time();
+        $web->go('https://httpstat.us/429');
+        $t2 = time();
+
+        // Check the status itself.
+        $this->assertSame(429, $web->statusCode);
+
+        // Check the detailed states.
+        $this->assertFalse($web->isSuccess);
+        $this->assertTrue($web->isTemporaryResult);
+        $this->assertFalse($web->isGone);
+        $this->assertFalse($web->isPermanentError);
+
+        // Check the request properties
+        $this->assertFalse($web->usesTemporaryRedirect);
+        $this->assertSame('', $web->permanentRedirectUrl);
+        $this->assertGreaterThan($t1, $web->retryAt);
+        $this->assertLessThanOrEqual($t2 + 5, $web->retryAt);
+    }
+
+    /**
+     * @test
+     */
+    public function testNetworkError()
+    {
+        $web = new \Spekulatius\PHPScraper\PHPScraper;
+
+        // Navigate to the test page which is invalid
+        $web->go('https://example.tld/');
+
+        // Check the status itself.
+        $this->assertSame(0, $web->statusCode);
+
+        // Check the detailed states.
+        $this->assertFalse($web->isSuccess);
+        $this->assertFalse($web->isTemporaryResult);
+        $this->assertFalse($web->isGone);
+        $this->assertTrue($web->isPermanentError);
+
+        // Check the request properties
+        $this->assertFalse($web->usesTemporaryRedirect);
+        $this->assertSame('', $web->permanentRedirectUrl);
+        $this->assertSame(0, $web->retryAt);
+    }
+
+    /**
+     * @test
+     */
+    public function testTimeout()
+    {
+        $web = new \Spekulatius\PHPScraper\PHPScraper(['timeout' => 0]);
+
+        // Navigate to the test page
+        $web->go('https://phpscraper.de/');
+
+        // Check the status itself.
+        $this->assertSame(499, $web->statusCode);
+
+        // Check the detailed states.
+        $this->assertFalse($web->isSuccess);
+        $this->assertTrue($web->isTemporaryResult);
+        $this->assertFalse($web->isGone);
+        $this->assertFalse($web->isPermanentError);
+
+        // Check the request properties
+        $this->assertFalse($web->usesTemporaryRedirect);
+        $this->assertSame('', $web->permanentRedirectUrl);
+        $this->assertSame(0, $web->retryAt);
+    }
+
+}