Skip to content

Commit

Permalink
Validate CSS Selectors and XPath Queries earlier
Browse files Browse the repository at this point in the history
When creating a `CssSelector` or `XPathQuery` instance with invalid
selector/query syntax, an `InvalidDomQueryException` is now thrown
immediately. This change is considered to be not only non-breaking, but
actually a fix: previously, the `CssSelector` would only throw an
exception later, when the `apply()` method was called, and the
`XPathQuery` would just return no result and generate a PHP warning,
without otherwise notifying you of the invalid query.
  • Loading branch information
otsch committed Dec 1, 2023
1 parent e632daa commit 69a1949
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [1.3.2] - 2023-12-01
### Fixed
* When creating a `CssSelector` or `XPathQuery` instance with invalid selector/query syntax, an `InvalidDomQueryException` is now thrown immediately. This change is considered to be not only non-breaking, but actually a fix: previously, the `CssSelector` would only throw an exception later, when the `apply()` method was called, and the `XPathQuery` would just return no result and generate a PHP warning, without otherwise notifying you of the invalid query.

## [1.3.1] - 2023-11-30
### Fixed
* Support usage with the new Symfony major version v7.
Expand Down
18 changes: 18 additions & 0 deletions src/Steps/Html/CssSelector.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,28 @@

namespace Crwlr\Crawler\Steps\Html;

use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Symfony\Component\CssSelector\CssSelectorConverter;
use Symfony\Component\CssSelector\Exception\ExpressionErrorException;
use Symfony\Component\CssSelector\Exception\SyntaxErrorException;
use Symfony\Component\DomCrawler\Crawler;

final class CssSelector extends DomQuery
{
/**
 * Checks the selector by converting it to XPath once, so that a malformed selector is
 * reported when the instance is created.
 *
 * @throws InvalidDomQueryException when the Symfony converter rejects the selector
 */
public function __construct(string $query)
{
    $converter = new CssSelectorConverter();

    try {
        // Only the side effect matters here: the conversion throws for selectors it can not handle.
        $converter->toXPath($query);
    } catch (SyntaxErrorException|ExpressionErrorException $symfonyException) {
        throw InvalidDomQueryException::fromSymfonyException($query, $symfonyException);
    }

    parent::__construct($query);
}

public function filter(Crawler $domCrawler): Crawler
{
return $domCrawler->filter($this->query);
Expand Down
41 changes: 41 additions & 0 deletions src/Steps/Html/Exceptions/InvalidDomQueryException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?php

namespace Crwlr\Crawler\Steps\Html\Exceptions;

use Exception;
use Symfony\Component\CssSelector\Exception\ExpressionErrorException;
use Symfony\Component\CssSelector\Exception\SyntaxErrorException;

/**
 * Thrown when a DOM query (CSS selector or XPath expression) is syntactically invalid.
 *
 * Besides the message, the exception carries the offending query string, so code catching
 * it can report or log exactly which query was rejected.
 */
class InvalidDomQueryException extends Exception
{
    /**
     * The query string that was rejected. Empty until setDomQuery() is called.
     */
    protected string $query = '';

    /**
     * Creates an exception with a custom message for the given query.
     *
     * @param string $message  the exception message
     * @param string $domQuery the invalid selector/query
     */
    public static function make(string $message, string $domQuery): self
    {
        $exception = new self($message);

        $exception->setDomQuery($domQuery);

        return $exception;
    }

    /**
     * Wraps an exception thrown by the Symfony CssSelector component.
     *
     * Message and code are copied from the original exception, which is also kept as the
     * "previous" exception so the original stack trace is not lost.
     *
     * @param string $domQuery the invalid selector/query
     */
    public static function fromSymfonyException(
        string $domQuery,
        ExpressionErrorException|SyntaxErrorException $originalException,
    ): self {
        $exception = new self(
            $originalException->getMessage(),
            $originalException->getCode(),
            $originalException,
        );

        $exception->setDomQuery($domQuery);

        return $exception;
    }

    /**
     * Stores the query string that caused this exception.
     */
    public function setDomQuery(string $domQuery): void
    {
        $this->query = $domQuery;
    }

    /**
     * Returns the query string that caused this exception, or an empty string if none was set.
     */
    public function getDomQuery(): string
    {
        return $this->query;
    }
}
13 changes: 11 additions & 2 deletions src/Steps/Html/GetLink.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,12 @@ class GetLink extends Step

protected bool $withFragment = true;

public function __construct(protected ?string $selector = null) {}
protected null|string|CssSelector $selector = null;

/**
 * @param null|string|CssSelector $selector optional selector; a plain string is wrapped
 *                                          in a CssSelector instance, anything else is
 *                                          stored unchanged
 */
public function __construct(null|string|CssSelector $selector = null)
{
    if (is_string($selector)) {
        $selector = new CssSelector($selector);
    }

    $this->selector = $selector;
}

public static function isSpecialNonHttpLink(Crawler $linkElement): bool
{
Expand Down Expand Up @@ -65,7 +70,11 @@ protected function invoke(mixed $input): Generator

$selector = $this->selector ?? 'a';

foreach ($input->filter($selector) as $link) {
if (is_string($selector)) {
$selector = new CssSelector($selector);
}

foreach ($selector->filter($input) as $link) {
$linkUrl = $this->getLinkUrl($link);

if ($linkUrl) {
Expand Down
6 changes: 5 additions & 1 deletion src/Steps/Html/GetLinks.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ protected function invoke(mixed $input): Generator

$selector = $this->selector ?? 'a';

foreach ($input->filter($selector) as $link) {
if (is_string($selector)) {
$selector = new CssSelector($selector);
}

foreach ($selector->filter($input) as $link) {
$linkUrl = $this->getLinkUrl($link);

if ($linkUrl) {
Expand Down
36 changes: 36 additions & 0 deletions src/Steps/Html/XPathQuery.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,48 @@

namespace Crwlr\Crawler\Steps\Html;

use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use DOMDocument;
use DOMXPath;
use Symfony\Component\DomCrawler\Crawler;

/**
 * DOM query based on an XPath expression.
 *
 * The expression is checked when the instance is created, so a broken query fails with an
 * exception right away instead of only producing a PHP warning when it is applied.
 */
class XPathQuery extends DomQuery
{
    /**
     * @param string $query the XPath expression
     * @throws InvalidDomQueryException when DOMXPath can not evaluate the expression
     */
    public function __construct(string $query)
    {
        $this->validateQuery($query);

        parent::__construct($query);
    }

    /**
     * Passes the stored query to the crawler's filterXPath() method and returns its result.
     */
    public function filter(Crawler $domCrawler): Crawler
    {
        return $domCrawler->filterXPath($this->query);
    }

    /**
     * Evaluates the query against an empty document to find out whether it is valid.
     *
     * NOTE(review): an expression whose legitimate result is boolean false (e.g. `false()`)
     * is also reported as invalid, because only the `false` return value is checked.
     *
     * @throws InvalidDomQueryException
     */
    private function validateQuery(string $query): void
    {
        // Temporarily install an error handler, so checking an invalid XPath query does not generate a PHP warning.
        // Returning false for anything else hands the error over to PHP's standard handling.
        set_error_handler(static function (int $errno, string $errstr): bool {
            return $errno === E_WARNING && $errstr === 'DOMXPath::evaluate(): Invalid expression';
        });

        try {
            $result = (new DOMXPath(new DOMDocument()))->evaluate($query);
        } finally {
            // Pop the temporary handler again, even if evaluate() throws. Using
            // restore_error_handler() instead of set_error_handler($previous) keeps the
            // handler stack from growing with every validated query.
            restore_error_handler();
        }

        if ($result === false) {
            throw InvalidDomQueryException::make('Invalid XPath query', $query);
        }
    }
}
5 changes: 5 additions & 0 deletions tests/Steps/Html/CssSelectorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@
namespace tests\Steps\Html;

use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Symfony\Component\DomCrawler\Crawler;

use function tests\helper_getSimpleListHtml;

// Each dataset value is passed to the constructor on its own; constructing the instance alone
// (without applying it to any document) has to throw.
it('throws an exception when created with an invalid CSS Selector', function ($selector) {
new CssSelector($selector);
})->throws(InvalidDomQueryException::class)->with(['.foo;', '.foo:before']);

test('The apply method returns a string for a single match', function () {
$html = '<div class="item">test</div>';

Expand Down
5 changes: 5 additions & 0 deletions tests/Steps/Html/XPathQueryTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@

namespace tests\Steps\Html;

use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Html\XPathQuery;
use Symfony\Component\DomCrawler\Crawler;

use function tests\helper_getSimpleListHtml;

// Constructing the instance alone (without applying it to any document) has to throw.
it('throws an exception when created with an invalid XPath query', function () {
new XPathQuery('//a/@@bob/uncle');
})->throws(InvalidDomQueryException::class);

test('The apply method returns a string for a single match', function () {
$xml = '<item>test</item>';

Expand Down

0 comments on commit 69a1949

Please sign in to comment.