Skip to content

Commit

Permalink
Use JS response when needed
Browse files Browse the repository at this point in the history
  • Loading branch information
Baspa committed Aug 17, 2023
1 parent 1fb6e09 commit 681c7b1
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 4 deletions.
1 change: 1 addition & 0 deletions resources/lang/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"failed.content.broken_images": "The page contains broken images. These images were found: :actualValue.",
"failed.content.broken_links": "The page contains broken links. These links were found: :actualValue.",
"failed.content.length": "The content is :actualValue characters long. It should be at least :expectedValue characters long.",
"failed.content.length.parse": "We were unable to parse the content of this page, please try again.",
"failed.content.mixed_content": "The page contains links to insecure addresses, while it should not. These links were found :actualValue.",
"failed.content.multiple_h1": "The page contains multiple h1 tags, while it should not. These tags were found :actualValue.",
"failed.content.no_heading": "The page does not contain any h1 tag, while it should.",
Expand Down
27 changes: 23 additions & 4 deletions src/Checks/Content/ContentLengthCheck.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public function check(Response $response, Crawler $crawler): bool
return true;
}

$content = $this->getContentToValidate($response);
$content = $this->getContentToValidate($response, $crawler);

if (! $content) {
return true;
Expand All @@ -47,15 +47,34 @@ public function check(Response $response, Crawler $crawler): bool
return $this->validateContent($content);
}

/**
 * Extract the readable text of the page so its length can be validated.
 *
 * The HTML handed to Readability is the raw response body, unless
 * `$this->useJavascript` is set, in which case the markup of the
 * crawler's `<body>` element is used instead.
 *
 * @param  Response  $response  Response whose body and transfer stats (final URL) are read.
 * @param  Crawler  $crawler  Crawler for the page; only consulted when JavaScript is used.
 * @return string|null  The extracted text, or null when Readability reports that it
 *                      could not parse the page (the failure reason is set in that case).
 */
public function getContentToValidate(Response $response, Crawler $crawler): ?string
{
    $url = $response->transferStats->getHandlerStats()['url'];

    $body = $response->body();

    if ($this->useJavascript) {
        $body = $crawler->filter('body')->html();
    }

    $readability = new Readability($body, $url);

    $readability->init();

    $textContent = $readability->getContent()->textContent;

    /**
     * Fallback for when Readability is unable to parse the content.
     * When scanning a JavaScript-rendered page we sometimes do not get a
     * proper response; Readability then yields the placeholder message
     * below instead of real content. In that case we record the failure
     * reason and return null.
     *
     * @todo we should check if we can improve this.
     */
    $unparsableMessage = 'Sorry, Readability was unable to parse this page for content.';

    if ($textContent === $unparsableMessage) {
        $this->failureReason = __('failed.content.length.parse');

        return null;
    }

    // Static analysis reports $textContent as a non-nullable value here,
    // so the former `?? null` fallback was dead code.
    return $textContent;
}

public function validateContent(string|array $content): bool
Expand Down

0 comments on commit 681c7b1

Please sign in to comment.