From 8adf16a4768f598bd9f15c2105c66fe85dd8e7a9 Mon Sep 17 00:00:00 2001 From: otsch Date: Sun, 8 Dec 2024 20:39:58 +0100 Subject: [PATCH] V3 with new DOM API and Array Step Output Filter Details see CHANGELOG.md. --- .github/workflows/ci.yml | 22 + CHANGELOG.md | 28 + composer.json | 5 +- phpstan.neon | 9 + src/Steps/BaseStep.php | 74 +-- src/Steps/Dom.php | 116 ++-- src/Steps/Dom/DomDocument.php | 20 + src/Steps/Dom/HtmlDocument.php | 55 ++ src/Steps/Dom/HtmlElement.php | 38 ++ src/Steps/Dom/Node.php | 166 +++++ src/Steps/Dom/NodeList.php | 154 +++++ src/Steps/Dom/XmlDocument.php | 41 ++ src/Steps/Dom/XmlElement.php | 33 + src/Steps/Filters/AbstractFilter.php | 83 +++ src/Steps/Filters/ArrayFilter.php | 28 + src/Steps/Filters/ClosureFilter.php | 2 +- src/Steps/Filters/ComparisonFilter.php | 2 +- src/Steps/Filters/Filter.php | 81 +-- src/Steps/Filters/Filterable.php | 82 +++ src/Steps/Filters/StringFilter.php | 2 +- src/Steps/Filters/StringLengthFilter.php | 2 +- src/Steps/Filters/UrlFilter.php | 2 +- src/Steps/Html.php | 21 +- src/Steps/Html/CssSelector.php | 26 +- src/Steps/Html/DomQuery.php | 102 +-- src/Steps/Html/DomQueryInterface.php | 20 - .../Exceptions/InvalidDomQueryException.php | 14 + src/Steps/Html/GetLink.php | 34 +- src/Steps/Html/GetLinks.php | 6 +- src/Steps/Html/MetaData.php | 24 +- src/Steps/Html/SchemaOrg.php | 7 + src/Steps/Html/SelectorTarget.php | 5 +- src/Steps/Html/XPathQuery.php | 12 +- src/Steps/Json.php | 4 +- src/Steps/Loading/Http/Document.php | 17 +- src/Steps/Loading/Http/Paginator.php | 4 +- .../Paginators/SimpleWebsitePaginator.php | 40 +- .../Paginators/StopRules/IsEmptyInDom.php | 36 +- .../Paginators/StopRules/IsEmptyInHtml.php | 11 +- .../Paginators/StopRules/IsEmptyInXml.php | 11 +- .../StopRules/PaginatorStopRules.php | 6 +- src/Steps/Loading/HttpCrawl.php | 32 +- src/Steps/Sitemap/GetUrlsFromSitemap.php | 38 +- src/Steps/Step.php | 19 +- src/Steps/Xml.php | 25 +- tests/Steps/Dom/HtmlDocumentTest.php | 88 +++ 
tests/Steps/Dom/HtmlElementTest.php | 180 +++++ tests/Steps/Dom/NodeListTest.php | 220 +++++++ tests/Steps/Dom/NodeTest.php | 615 ++++++++++++++++++ tests/Steps/Dom/XmlDocumentTest.php | 70 ++ tests/Steps/Dom/XmlElementTest.php | 96 +++ tests/Steps/Dom/_Stubs/HtmlNodeStub.php | 24 + tests/Steps/Dom/_Stubs/XmlNodeStub.php | 24 + tests/Steps/DomTest.php | 113 ++-- tests/Steps/Filters/ArrayFilterTest.php | 109 ++++ tests/Steps/Filters/FilterTest.php | 4 +- tests/Steps/Html/CssSelectorTest.php | 110 +--- tests/Steps/Html/XPathQueryTest.php | 95 +-- tests/Steps/HtmlTest.php | 119 ++-- tests/Steps/Loading/Http/DocumentTest.php | 9 +- .../Steps/Sitemap/GetUrlsFromSitemapTest.php | 43 +- tests/Steps/XmlTest.php | 220 ++++--- tests/_Integration/Http/CrawlingTest.php | 79 +-- .../_Integration/Http/HeadlessBrowserTest.php | 6 +- 64 files changed, 2922 insertions(+), 861 deletions(-) create mode 100644 src/Steps/Dom/DomDocument.php create mode 100644 src/Steps/Dom/HtmlDocument.php create mode 100644 src/Steps/Dom/HtmlElement.php create mode 100644 src/Steps/Dom/Node.php create mode 100644 src/Steps/Dom/NodeList.php create mode 100644 src/Steps/Dom/XmlDocument.php create mode 100644 src/Steps/Dom/XmlElement.php create mode 100644 src/Steps/Filters/AbstractFilter.php create mode 100644 src/Steps/Filters/ArrayFilter.php create mode 100644 src/Steps/Filters/Filterable.php delete mode 100644 src/Steps/Html/DomQueryInterface.php create mode 100644 tests/Steps/Dom/HtmlDocumentTest.php create mode 100644 tests/Steps/Dom/HtmlElementTest.php create mode 100644 tests/Steps/Dom/NodeListTest.php create mode 100644 tests/Steps/Dom/NodeTest.php create mode 100644 tests/Steps/Dom/XmlDocumentTest.php create mode 100644 tests/Steps/Dom/XmlElementTest.php create mode 100644 tests/Steps/Dom/_Stubs/HtmlNodeStub.php create mode 100644 tests/Steps/Dom/_Stubs/XmlNodeStub.php create mode 100644 tests/Steps/Filters/ArrayFilterTest.php diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml 
index 12e2284..2c4dd72 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,6 +28,28 @@ jobs: - name: Run integration tests run: composer test-integration + tests84: + name: PestPHP Tests Running only on PHP >= 8.4 + runs-on: ubuntu-latest + strategy: + matrix: + php-versions: [ '8.4' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install PHP + uses: shivammathur/setup-php@v2 + with: + php-version: ${{ matrix.php-versions }} + + - name: Install dependencies + run: composer install --prefer-dist --no-progress + + - name: Run tests + run: composer test-php84 + stanAndCs: name: Static Analysis (phpstan) and Code Style (PHP CS Fixer) runs-on: ubuntu-latest diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c283ab..3bbad9f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [3.0.0] - 2024-12-x +The primary change in version 3.0.0 is that the library now leverages PHP 8.4’s new DOM API when used in an environment with PHP >= 8.4. To maintain compatibility with PHP < 8.4, an abstraction layer has been implemented. This layer dynamically uses either the Symfony DomCrawler component or the new DOM API, depending on the PHP version. + +Since no direct interaction with an instance of the Symfony DomCrawler library was required at the step level provided by the library, it is highly likely that you won’t need to make any changes to your code to upgrade to v3. To ensure a smooth transition, please review the points under “Changed.” + +If you're using XPath queries for data extraction, please try to switch to using CSS selectors instead, because XPath is no longer supported by the new DOM API. Therefore, XPath-related functionality was deprecated in this version of the library and will probably be removed in the next major version. + +### Changed +* The `DomQuery::innerText()` method (a.k.a.
`Dom::cssSelector('...')->innerText()`) has been removed. `innerText` exists only in the Symfony DomCrawler component, and its usefulness is questionable. If you still require this variant of the DOM element text, please let us know or create a pull request yourself. Thank you! +* The `DomQueryInterface` was removed. As the `DomQuery` class offers a lot more functionality than the interface defines, the purpose of the interface was questionable. Please use the abstract `DomQuery` class instead. This also means that some method signatures, type hinting the interface, have changed. Look for occurrences of `DomQueryInterface` and replace them. +* The visibility of the `DomQuery::filter()` method was changed from public to protected. It is still needed in the `DomQuery` class, but outside of it, it is probably better and easier to directly use the new DOM abstraction (see the `src/Steps/Dom` directory). If you are extending the `DomQuery` class (which is not recommended), be aware that the argument now takes a `Node` (from the new DOM abstraction) instead of a Symfony `Crawler`. +* The `Step::validateAndSanitizeToDomCrawlerInstance()` method was removed. Please use the `Step::validateAndSanitizeToHtmlDocumentInstance()` and `Step::validateAndSanitizeToXmlDocumentInstance()` methods instead. +* The second argument in `Closure`s passed to the `Http::crawl()->customFilter()` has changed from an instance of Symfony `Crawler` class, to an `HtmlElement` instance from the new DOM abstraction (`Crwlr\Crawler\Steps\Dom\HtmlElement`). +* The second argument in `Closure`s passed to the `Http::crawl()->customFilter()` has changed from an instance of Symfony `Crawler` class, to an `HtmlElement` instance from the new DOM abstraction (`Crwlr\Crawler\Steps\Dom\HtmlElement`).
+* The Filter class was split into `AbstractFilter` (base class for actual filter classes) and `Filter` only hosting the static function for easy instantiation, because otherwise each filter class also has all the static methods. +* Further, the signatures of some methods that are mainly here for internal usage, have changed due to the new DOM abstraction: + * The static `GetLink::isSpecialNonHttpLink()` method now needs an instance of `HtmlElement` instead of a Symfony `Crawler`. + * `GetUrlsFromSitemap::fixUrlSetTag()` now takes an `XmlDocument` instead of a Symfony `Crawler`. + * The `DomQuery::apply()` method now takes a `Node` instead of a Symfony `Crawler`. + +### Deprecated +* `Dom::xPath()` method and +* the `XPathQuery` class as well as +* the new `Node::queryXPath()` method. + +### Added +* New step output filter `Filter::arrayHasElement()`. When a step produces array output with a property being a numeric array, you can now filter outputs by checking if one element of that array property, matches certain filter criteria. Example: The outputs look like `['foo' => 'bar', 'baz' => ['one', 'two', 'three']]`. You can filter all outputs where `baz` contains `two` like: `Filter::arrayHasElement()->where('baz', Filter::equal('two'))`. + ## [2.1.3] - 2024-11-05 ### Fixed * Improvements for deprecations in PHP 8.4. 
diff --git a/composer.json b/composer.json index dca4717..54fb8fa 100644 --- a/composer.json +++ b/composer.json @@ -40,7 +40,7 @@ "guzzlehttp/guzzle": "^7.4", "adbario/php-dot-notation": "^3.1", "chrome-php/chrome": "^1.7", - "crwlr/utils": "^1.1", + "crwlr/utils": "^1.2", "crwlr/html-2-text": "^0.1.0" }, "require-dev": { @@ -74,7 +74,8 @@ } }, "scripts": { - "test": "pest --exclude-group integration --display-warnings --bail", + "test": "pest --exclude-group integration,php84 --display-warnings --bail", + "test-php84": "pest --group php84 --display-warnings --bail", "test-integration": "pest --group integration --display-warnings --bail", "stan": "@php -d memory_limit=4G vendor/bin/phpstan analyse", "cs": "php-cs-fixer fix -v --dry-run", diff --git a/phpstan.neon b/phpstan.neon index 2b7e747..896e34d 100644 --- a/phpstan.neon +++ b/phpstan.neon @@ -12,3 +12,12 @@ parameters: - "#^Access to an undefined property Spatie\\\\Invade\\\\Invader#" - "#^Call to an undefined method Spatie\\\\Invade\\\\Invader#" - "#^Call to protected method [a-zA-Z]{5,30}\\(\\) of class PHPUnit\\\\Framework\\\\TestCase.#" + - "#^(?:Parameter|Method) .+ has invalid (return )?type Dom\\\\.+\\.#" + - "#^Call to .+ on an unknown class Dom\\\\.+\\.#" + - "#^Property .+ has unknown class Dom\\\\.+ as its type\\.#" + - "#^Class Dom\\\\.+ not found.#" + - "#^Access to property .+ on an unknown class Dom\\\\.+\\.#" + - "#^PHPDoc tag .+ contains unknown class Dom\\\\.+\\.#" + - "#^Call to an undefined (static )?method Dom\\\\.+::.+\\(\\)\\.#" + - "#^Access to an undefined property Dom\\\\.+::\\$.+\\.#" + - "#^Function .+ has invalid return type Dom\\\\.+\\.#" diff --git a/src/Steps/BaseStep.php b/src/Steps/BaseStep.php index 1487540..482a411 100644 --- a/src/Steps/BaseStep.php +++ b/src/Steps/BaseStep.php @@ -10,10 +10,9 @@ use Crwlr\Crawler\Output; use Crwlr\Crawler\Result; use Crwlr\Crawler\Steps\Exceptions\PreRunValidationException; -use Crwlr\Crawler\Steps\Filters\FilterInterface; +use 
Crwlr\Crawler\Steps\Filters\Filterable; use Crwlr\Crawler\Steps\Refiners\RefinerInterface; use Crwlr\Crawler\Utils\OutputTypeHelper; -use Exception; use Generator; use InvalidArgumentException; use Psr\Log\LoggerInterface; @@ -24,6 +23,8 @@ abstract class BaseStep implements StepInterface { + use Filterable; + /** * true means: keep the whole output array/object * string: keep that one key from the (array/object) output @@ -69,11 +70,6 @@ abstract class BaseStep implements StepInterface */ protected array $uniqueOutputKeys = []; - /** - * @var FilterInterface[] - */ - protected array $filters = []; - /** * @var array */ @@ -183,47 +179,6 @@ public function uniqueOutputs(?string $key = null): static return $this; } - public function where(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static - { - if (is_string($keyOrFilter) && $filter === null) { - throw new InvalidArgumentException('You have to provide a Filter (instance of FilterInterface)'); - } elseif (is_string($keyOrFilter)) { - if ($this->isOutputKeyAlias($keyOrFilter)) { - $keyOrFilter = $this->getOutputKeyAliasRealKey($keyOrFilter); - } - - $filter->useKey($keyOrFilter); - - $this->filters[] = $filter; - } else { - $this->filters[] = $keyOrFilter; - } - - return $this; - } - - /** - * @throws Exception - */ - public function orWhere(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static - { - if (empty($this->filters)) { - throw new Exception('No where before orWhere'); - } elseif (is_string($keyOrFilter) && $filter === null) { - throw new InvalidArgumentException('You have to provide a Filter (instance of FilterInterface)'); - } elseif (is_string($keyOrFilter)) { - $filter->useKey($keyOrFilter); - } else { - $filter = $keyOrFilter; - } - - $lastFilter = end($this->filters); - - $lastFilter->addOr($filter); - - return $this; - } - public function refineOutput( string|Closure|RefinerInterface $keyOrRefiner, null|Closure|RefinerInterface $refiner = null, @@ 
-539,29 +494,6 @@ protected function inputOrOutputIsUnique(Io $io): bool return true; } - protected function passesAllFilters(mixed $output): bool - { - foreach ($this->filters as $filter) { - if (!$filter->evaluate($output)) { - if ($filter->getOr()) { - $orFilter = $filter->getOr(); - - while ($orFilter) { - if ($orFilter->evaluate($output)) { - continue 2; - } - - $orFilter = $orFilter->getOr(); - } - } - - return false; - } - } - - return true; - } - protected function applyRefiners(mixed $outputValue, mixed $inputValue): mixed { foreach ($this->refiners as $refiner) { diff --git a/src/Steps/Dom.php b/src/Steps/Dom.php index 9dc647e..48d2883 100644 --- a/src/Steps/Dom.php +++ b/src/Steps/Dom.php @@ -2,41 +2,46 @@ namespace Crwlr\Crawler\Steps; +use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; +use Crwlr\Crawler\Steps\Dom\DomDocument; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; +use Crwlr\Crawler\Steps\Dom\Node; +use Crwlr\Crawler\Steps\Dom\NodeList; +use Crwlr\Crawler\Steps\Dom\XmlDocument; use Crwlr\Crawler\Steps\Html\CssSelector; -use Crwlr\Crawler\Steps\Html\DomQueryInterface; +use Crwlr\Crawler\Steps\Html\DomQuery; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use Crwlr\Crawler\Steps\Html\XPathQuery; +use Crwlr\Html2Text\Exceptions\InvalidHtmlException; use Exception; use Generator; use InvalidArgumentException; -use Symfony\Component\DomCrawler\Crawler; abstract class Dom extends Step { protected bool $root = false; - protected ?DomQueryInterface $each = null; + protected ?DomQuery $each = null; - protected ?DomQueryInterface $first = null; + protected ?DomQuery $first = null; - protected ?DomQueryInterface $last = null; + protected ?DomQuery $last = null; /** - * @var array + * @var array */ protected array $mapping = []; - protected null|string|DomQueryInterface $singleSelector = null; + protected null|string|DomQuery $singleSelector = null; protected ?string 
$baseUrl = null; /** - * @param string|DomQueryInterface|array $selectorOrMapping + * @param string|DomQuery|array $selectorOrMapping */ - final public function __construct( - string|DomQueryInterface|array $selectorOrMapping = [], - ) { + final public function __construct(string|DomQuery|array $selectorOrMapping = []) + { $this->extract($selectorOrMapping); } @@ -49,7 +54,7 @@ public static function root(): static return $instance; } - public static function each(string|DomQueryInterface $domQuery): static + public static function each(string|DomQuery $domQuery): static { $instance = new static(); @@ -58,7 +63,7 @@ public static function each(string|DomQueryInterface $domQuery): static return $instance; } - public static function first(string|DomQueryInterface $domQuery): static + public static function first(string|DomQuery $domQuery): static { $instance = new static(); @@ -67,7 +72,7 @@ public static function first(string|DomQueryInterface $domQuery): static return $instance; } - public static function last(string|DomQueryInterface $domQuery): static + public static function last(string|DomQuery $domQuery): static { $instance = new static(); @@ -86,18 +91,20 @@ public static function cssSelector(string $selector): CssSelector /** * @throws InvalidDomQueryException + * @deprecated As the usage of XPath queries is no longer an option with the new DOM API introduced in + * PHP 8.4, please switch to using CSS selectors instead! 
*/ public static function xPath(string $query): XPathQuery { return new XPathQuery($query); } - abstract protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface; + abstract protected function makeDefaultDomQueryInstance(string $query): DomQuery; /** - * @param string|DomQueryInterface|array $selectorOrMapping + * @param string|DomQuery|array $selectorOrMapping */ - public function extract(string|DomQueryInterface|array $selectorOrMapping): static + public function extract(string|DomQuery|array $selectorOrMapping): static { if (is_array($selectorOrMapping)) { $this->mapping = $selectorOrMapping; @@ -108,18 +115,6 @@ public function extract(string|DomQueryInterface|array $selectorOrMapping): stat return $this; } - /** - * @throws InvalidArgumentException - */ - protected function validateAndSanitizeInput(mixed $input): Crawler - { - if ($input instanceof RespondedRequest) { - $this->baseUrl = $input->effectiveUri(); - } - - return new Crawler($this->validateAndSanitizeStringOrHttpResponse($input)); - } - public function outputType(): StepOutputType { return empty($this->mapping) && $this->singleSelector ? 
@@ -128,14 +123,14 @@ public function outputType(): StepOutputType } /** - * @param Crawler $input + * @param HtmlDocument|Node $input * @throws Exception */ protected function invoke(mixed $input): Generator { $base = $this->getBase($input); - if ($base->count() === 0) { + if (!$base || ($base instanceof NodeList && $base->count() === 0)) { return; } @@ -143,16 +138,35 @@ protected function invoke(mixed $input): Generator yield from $this->singleSelector($base); } else { if ($this->each) { - foreach ($base as $element) { - yield $this->mapProperties(new Crawler($element)); + if ($base instanceof NodeList) { + foreach ($base as $element) { + yield $this->mapProperties($element); + } } - } else { + } elseif ($base instanceof Node) { yield $this->mapProperties($base); } } } - protected function singleSelector(Crawler $domCrawler): Generator + + /** + * @throws InvalidArgumentException|MissingZlibExtensionException + */ + protected function validateAndSanitizeInput(mixed $input): HtmlDocument|XmlDocument + { + if ($input instanceof RespondedRequest) { + $this->baseUrl = $input->effectiveUri(); + } + + return new HtmlDocument($this->validateAndSanitizeStringOrHttpResponse($input)); + } + + /** + * @throws InvalidHtmlException + * @throws Exception + */ + protected function singleSelector(Node|NodeList $nodeOrNodeList): Generator { if ($this->singleSelector === null) { return; @@ -166,7 +180,15 @@ protected function singleSelector(Crawler $domCrawler): Generator $domQuery->setBaseUrl($this->baseUrl); } - $outputs = $domQuery->apply($domCrawler); + if ($nodeOrNodeList instanceof NodeList) { + $outputs = []; + + foreach ($nodeOrNodeList as $node) { + $outputs[] = $domQuery->apply($node); + } + } else { + $outputs = $domQuery->apply($nodeOrNodeList); + } if (is_array($outputs)) { foreach ($outputs as $output) { @@ -181,7 +203,7 @@ protected function singleSelector(Crawler $domCrawler): Generator * @return mixed[] * @throws Exception */ - protected function 
mapProperties(Crawler $domCrawler): array + protected function mapProperties(Node $node): array { $mappedProperties = []; @@ -189,7 +211,7 @@ protected function mapProperties(Crawler $domCrawler): array if ($domQuery instanceof Dom) { $domQuery->baseUrl = $this->baseUrl; - $mappedProperties[$key] = iterator_to_array($domQuery->invoke($domCrawler)); + $mappedProperties[$key] = iterator_to_array($domQuery->invoke($node)); } else { if (is_string($domQuery)) { $domQuery = $this->makeDefaultDomQueryInstance($domQuery); @@ -199,7 +221,7 @@ protected function mapProperties(Crawler $domCrawler): array $domQuery->setBaseUrl($this->baseUrl); } - $mappedProperties[$key] = $domQuery->apply($domCrawler); + $mappedProperties[$key] = $domQuery->apply($node); } } @@ -209,16 +231,22 @@ protected function mapProperties(Crawler $domCrawler): array /** * @throws Exception */ - protected function getBase(Crawler $domCrawler): Crawler + protected function getBase(DomDocument|Node $document): null|Node|NodeList { if ($this->root) { - return $domCrawler; + return $document; } elseif ($this->each) { - return $this->each->filter($domCrawler); + return $this->each instanceof CssSelector ? + $document->querySelectorAll($this->each->query) : + $document->queryXPath($this->each->query); } elseif ($this->first) { - return $this->first->filter($domCrawler)->first(); + return $this->first instanceof CssSelector ? + $document->querySelector($this->first->query) : + $document->queryXPath($this->first->query)->first(); } elseif ($this->last) { - return $this->last->filter($domCrawler)->last(); + return $this->last instanceof CssSelector ? 
+ $document->querySelectorAll($this->last->query)->last() : + $document->queryXPath($this->last->query)->last(); } throw new Exception('Invalid state: no base selector'); diff --git a/src/Steps/Dom/DomDocument.php b/src/Steps/Dom/DomDocument.php new file mode 100644 index 0000000..3af08f2 --- /dev/null +++ b/src/Steps/Dom/DomDocument.php @@ -0,0 +1,20 @@ +makeDocumentInstance($source)); // @phpstan-ignore-line + } + + /** + * @param string $source + * @return Document|Crawler + */ + abstract protected function makeDocumentInstance(string $source): object; +} diff --git a/src/Steps/Dom/HtmlDocument.php b/src/Steps/Dom/HtmlDocument.php new file mode 100644 index 0000000..36e1c5b --- /dev/null +++ b/src/Steps/Dom/HtmlDocument.php @@ -0,0 +1,55 @@ + querySelectorAll(string $selector) + * @method NodeList queryXPath(string $selector) + */ + +class HtmlDocument extends DomDocument +{ + /** + * Gets the href attribute of a tag in the document + * + * In case there are multiple base elements in the document: + * https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base + * "If multiple elements are used, only the first href and first target are obeyed..." 
+ */ + public function getBaseHref(): ?string + { + $baseTag = $this->querySelector('base'); + + return $baseTag?->getAttribute('href'); + } + + public function outerHtml(): string + { + return $this->outerSource(); + } + + /** + * @param \Dom\Node|DOMNode|Crawler $node + */ + protected function makeChildNodeInstance(object $node): Node + { + return new HtmlElement($node); + } + + /** + * @return \Dom\HTMLDocument|Crawler + */ + protected function makeDocumentInstance(string $source): object + { + if (PhpVersion::isAtLeast(8, 4)) { + return \Dom\HTMLDocument::createFromString($source, LIBXML_NOERROR); + } + + return new Crawler($source); + } +} diff --git a/src/Steps/Dom/HtmlElement.php b/src/Steps/Dom/HtmlElement.php new file mode 100644 index 0000000..0d8f1f7 --- /dev/null +++ b/src/Steps/Dom/HtmlElement.php @@ -0,0 +1,38 @@ + querySelectorAll(string $selector) + * @method NodeList queryXPath(string $selector) + */ + +class HtmlElement extends Node +{ + public function outerHtml(): string + { + return $this->outerSource(); + } + + public function innerHtml(): string + { + return $this->innerSource(); + } + + public function html(): string + { + return $this->innerHtml(); + } + + /** + * @param \Dom\Node|DOMNode|Crawler $node + */ + protected function makeChildNodeInstance(object $node): Node + { + return new HtmlElement($node); + } +} diff --git a/src/Steps/Dom/Node.php b/src/Steps/Dom/Node.php new file mode 100644 index 0000000..db2432c --- /dev/null +++ b/src/Steps/Dom/Node.php @@ -0,0 +1,166 @@ +node = $node; + } + + public function querySelector(string $selector): ?Node + { + if ($this->node instanceof Crawler) { + $filtered = $this->node->filter($selector); + + return $filtered->count() > 0 ? $this->makeChildNodeInstance($filtered->first()) : null; + } + + $result = $this->node->querySelector($selector); + + return $result !== null ? 
$this->makeChildNodeInstance($result) : null; + } + + public function querySelectorAll(string $selector): NodeList + { + if ($this->node instanceof Crawler) { + return $this->makeNodeListInstance($this->node->filter($selector)); + } + + return $this->makeNodeListInstance($this->node->querySelectorAll($selector)); + } + + /** + * @deprecated As the usage of XPath queries is no longer an option with the new DOM API introduced in + * PHP 8.4, please switch to using CSS selectors instead! + */ + public function queryXPath(string $query): NodeList + { + $node = $this->node; + + if (!$node instanceof Crawler) { + $node = new Crawler($this->outerSource()); + } + + return $this->makeNodeListInstance($node->filterXPath($query)); + } + + public function nodeName(): string + { + if ($this->node instanceof Crawler) { + $nodeName = $this->node->nodeName(); + } else { + $nodeName = $this->node->nodeName ?? ''; + } + + return strtolower($nodeName); + } + + public function text(): string + { + if ($this->node instanceof Crawler) { + $text = $this->node->text(); + } else { + $text = $this->node->textContent ?? 
''; + } + + return trim( + preg_replace("/(?:[ \n\r\t\x0C]{2,}+|[\n\r\t\x0C])/", ' ', $text), + " \n\r\t\x0C", + ); + } + + public function getAttribute(string $attributeName): ?string + { + if ($this->node instanceof Crawler) { + return $this->node->attr($attributeName); + } + + return $this->node->getAttribute($attributeName); + } + + /** + * @param \Dom\Node|DOMNode|Crawler $node + */ + abstract protected function makeChildNodeInstance(object $node): Node; + + protected function outerSource(): string + { + if ($this->node instanceof Crawler) { + return $this->node->outerHtml(); + } + + $parentDocument = $this->getParentDocumentOfNode($this->node); + + if ($parentDocument instanceof \Dom\HTMLDocument) { + return $parentDocument->saveHTML($this->node); + } elseif ($parentDocument instanceof \Dom\XMLDocument) { + return $parentDocument->saveXML($this->node); + } + + return $this->node->innerHTML; + } + + protected function innerSource(): string + { + if ($this->node instanceof Crawler) { + return $this->node->html(); + } + + return $this->node->innerHTML; + } + + /** + * @param \Dom\NodeList|Crawler $nodeList + */ + protected function makeNodeListInstance(object $nodeList): NodeList + { + return new NodeList( + $nodeList, + function (object $node): Node { + /** @var DOMNode|\Dom\Node $node */ + return $this->makeChildNodeInstance($node); + }, + ); + } + + /** + * @param \Dom\Node $node + * @return Document|null + */ + private function getParentDocumentOfNode(object $node): ?object + { + if ($node instanceof Document) { + return $node; + } + + $parentDocument = $node->parentNode; + + while ($parentDocument && !$parentDocument instanceof Document) { + $parentDocument = $parentDocument->parentNode; + } + + if ($parentDocument instanceof Document) { + return $parentDocument; + } + + return null; + } +} diff --git a/src/Steps/Dom/NodeList.php b/src/Steps/Dom/NodeList.php new file mode 100644 index 0000000..4537e20 --- /dev/null +++ b/src/Steps/Dom/NodeList.php @@ -0,0 
+1,154 @@ + + */ + +class NodeList implements IteratorAggregate, Countable +{ + /** + * @param \Dom\NodeList|Crawler|array $nodeList + */ + public function __construct( + private readonly object|array $nodeList, + private readonly ?Closure $makeNodeInstance = null, + ) {} + + /** + * @throws Exception + */ + public function first(): ?Node + { + $iterator = $this->getIterator(); + + $iterator->rewind(); + + return $iterator->current(); + } + + /** + * @throws Exception + */ + public function last(): ?Node + { + $iterator = $this->getIterator(); + + foreach ($iterator as $node) { + } + + return $node ?? null; + } + + /** + * @throws Exception + */ + public function nth(int $index): ?Node + { + $iterator = $this->getIterator(); + + $i = 0; + + foreach ($iterator as $node) { + if (($i + 1) === $index) { + return $node; + } + + $i++; + } + + return null; + } + + /** + * @return mixed[] + * @throws Exception + */ + public function each(Closure $callback): array + { + $data = []; + + foreach ($this->getIterator() as $key => $node) { + $data[] = $callback($node, $key); + } + + return $data; + } + + public function count(): int + { + if (is_array($this->nodeList)) { + return count($this->nodeList); + } + + return $this->nodeList->count(); + } + + public function getIterator(): Iterator + { + if (is_array($this->nodeList)) { + return new ArrayIterator($this->nodeList); + } + + $iterator = $this->nodeList->getIterator(); + + /** @var Iterator $iterator */ + + return new class ($iterator, $this->makeNodeInstance) implements Iterator { + /** + * @param Iterator $iterator + */ + public function __construct( + private readonly Iterator $iterator, + private readonly ?Closure $makeNodeInstanceCallback = null, + ) {} + + public function current(): ?Node + { + return $this->makeNodeInstance($this->iterator->current()); + } + + public function next(): void + { + $this->iterator->next(); + } + + public function key(): mixed + { + return $this->iterator->key(); + } + + public function 
valid(): bool + { + return $this->iterator->valid(); + } + + public function rewind(): void + { + $this->iterator->rewind(); + } + + /** + * @param \Dom\Node|DOMNode|Crawler $node + */ + private function makeNodeInstance(mixed $node): ?Node + { + if (!is_object($node)) { + return null; + } + + return $this->makeNodeInstanceCallback?->__invoke($node) ?? null; + } + }; + } +} diff --git a/src/Steps/Dom/XmlDocument.php b/src/Steps/Dom/XmlDocument.php new file mode 100644 index 0000000..dd84673 --- /dev/null +++ b/src/Steps/Dom/XmlDocument.php @@ -0,0 +1,41 @@ + querySelectorAll(string $selector) + * @method NodeList queryXPath(string $selector) + */ + +class XmlDocument extends DomDocument +{ + public function outerXml(): string + { + return $this->outerSource(); + } + + /** + * @param \Dom\Node|DOMNode|Crawler $node + */ + protected function makeChildNodeInstance(object $node): Node + { + return new XmlElement($node); + } + + /** + * @return \Dom\XMLDocument|Crawler + */ + protected function makeDocumentInstance(string $source): object + { + if (PhpVersion::isAtLeast(8, 4)) { + return \Dom\XMLDocument::createFromString($source, LIBXML_NOERROR); + } + + return new Crawler($source); + } +} diff --git a/src/Steps/Dom/XmlElement.php b/src/Steps/Dom/XmlElement.php new file mode 100644 index 0000000..5c0d496 --- /dev/null +++ b/src/Steps/Dom/XmlElement.php @@ -0,0 +1,33 @@ + querySelectorAll(string $selector) + * @method NodeList queryXPath(string $selector) + */ + +class XmlElement extends Node +{ + public function outerXml(): string + { + return $this->outerSource(); + } + + public function innerXml(): string + { + return $this->innerSource(); + } + + /** + * @param \Dom\Node|DOMNode|Crawler $node + */ + protected function makeChildNodeInstance(object $node): Node + { + return new XmlElement($node); + } +} diff --git a/src/Steps/Filters/AbstractFilter.php b/src/Steps/Filters/AbstractFilter.php new file mode 100644 index 0000000..b59a156 --- /dev/null +++ 
b/src/Steps/Filters/AbstractFilter.php @@ -0,0 +1,83 @@ +useKey = $key; + + return $this; + } + + /** + * Step::orWhere() uses this method to link further Filters with OR to this filter. + * The Step then takes care of checking if one of the ORs evaluates to true. + */ + public function addOr(FilterInterface $filter): void + { + if ($this->or instanceof FilterInterface) { + $or = $this->or; + + while ($or->getOr()) { + $or = $or->getOr(); + } + + $or->addOr($filter); + } else { + $this->or = $filter; + } + } + + /** + * Get the Filter linked to this Filter as OR. + */ + public function getOr(): ?FilterInterface + { + return $this->or instanceof FilterInterface ? $this->or : null; + } + + public function negate(): NegatedFilter + { + return new NegatedFilter($this); + } + + /** + * @throws Exception + */ + protected function getKey(mixed $value): mixed + { + if ($this->useKey === null) { + return $value; + } + + if (!is_array($value) && !is_object($value)) { + throw new InvalidArgumentException('Can only filter by key with array or object output.'); + } + + if (is_object($value) && !property_exists($value, $this->useKey) && method_exists($value, '__serialize')) { + $serialized = $value->__serialize(); + + if (array_key_exists($this->useKey, $serialized)) { + $value = $serialized; + } + } + + if ( + (is_array($value) && !array_key_exists($this->useKey, $value)) || + (is_object($value) && !property_exists($value, $this->useKey)) + ) { + throw new Exception('Key to filter by does not exist in output.'); + } + + return is_array($value) ? 
$value[$this->useKey] : $value->{$this->useKey}; + } +} diff --git a/src/Steps/Filters/ArrayFilter.php b/src/Steps/Filters/ArrayFilter.php new file mode 100644 index 0000000..719df73 --- /dev/null +++ b/src/Steps/Filters/ArrayFilter.php @@ -0,0 +1,28 @@ +getKey($valueInQuestion); + + if (is_array($valueInQuestion) && !empty($valueInQuestion)) { + foreach ($valueInQuestion as $value) { + if ($this->passesAllFilters($value)) { + return true; + } + } + } + + return false; + } +} diff --git a/src/Steps/Filters/ClosureFilter.php b/src/Steps/Filters/ClosureFilter.php index 6f65d2b..3979dde 100644 --- a/src/Steps/Filters/ClosureFilter.php +++ b/src/Steps/Filters/ClosureFilter.php @@ -5,7 +5,7 @@ use Closure; use Exception; -class ClosureFilter extends Filter +class ClosureFilter extends AbstractFilter { public function __construct( protected readonly Closure $closure, diff --git a/src/Steps/Filters/ComparisonFilter.php b/src/Steps/Filters/ComparisonFilter.php index e146218..e149f02 100644 --- a/src/Steps/Filters/ComparisonFilter.php +++ b/src/Steps/Filters/ComparisonFilter.php @@ -5,7 +5,7 @@ use Crwlr\Crawler\Steps\Filters\Enums\ComparisonFilterRule; use Exception; -class ComparisonFilter extends Filter +class ComparisonFilter extends AbstractFilter { public function __construct( protected readonly ComparisonFilterRule $filterRule, diff --git a/src/Steps/Filters/Filter.php b/src/Steps/Filters/Filter.php index e8d660d..2e4821d 100644 --- a/src/Steps/Filters/Filter.php +++ b/src/Steps/Filters/Filter.php @@ -7,15 +7,9 @@ use Crwlr\Crawler\Steps\Filters\Enums\StringFilterRule; use Crwlr\Crawler\Steps\Filters\Enums\StringLengthFilterRule; use Crwlr\Crawler\Steps\Filters\Enums\UrlFilterRule; -use Exception; -use InvalidArgumentException; -abstract class Filter implements FilterInterface +abstract class Filter { - protected ?string $useKey = null; - - protected bool|FilterInterface $or = false; - public static function equal(mixed $equalToValue): ComparisonFilter { return new 
ComparisonFilter(ComparisonFilterRule::Equal, $equalToValue); @@ -121,78 +115,13 @@ public static function urlPathMatches(string $urlPathMatchesValue): UrlFilter return new UrlFilter(UrlFilterRule::PathMatches, $urlPathMatchesValue); } - public static function custom(Closure $closure): ClosureFilter - { - return new ClosureFilter($closure); - } - - public function useKey(string $key): static - { - $this->useKey = $key; - - return $this; - } - - /** - * Step::orWhere() uses this method to link further Filters with OR to this filter. - * The Step then takes care of checking if one of the ORs evaluates to true. - */ - public function addOr(FilterInterface $filter): void - { - if ($this->or instanceof FilterInterface) { - $or = $this->or; - - while ($or->getOr()) { - $or = $or->getOr(); - } - - $or->addOr($filter); - } else { - $this->or = $filter; - } - } - - /** - * Get the Filter linked to this Filter as OR. - */ - public function getOr(): ?FilterInterface + public static function arrayHasElement(): ArrayFilter { - return $this->or instanceof FilterInterface ? 
$this->or : null; + return new ArrayFilter(); } - public function negate(): NegatedFilter - { - return new NegatedFilter($this); - } - - /** - * @throws Exception - */ - protected function getKey(mixed $value): mixed + public static function custom(Closure $closure): ClosureFilter { - if ($this->useKey === null) { - return $value; - } - - if (!is_array($value) && !is_object($value)) { - throw new InvalidArgumentException('Can only filter by key with array or object output.'); - } - - if (is_object($value) && !property_exists($value, $this->useKey) && method_exists($value, '__serialize')) { - $serialized = $value->__serialize(); - - if (array_key_exists($this->useKey, $serialized)) { - $value = $serialized; - } - } - - if ( - (is_array($value) && !array_key_exists($this->useKey, $value)) || - (is_object($value) && !property_exists($value, $this->useKey)) - ) { - throw new Exception('Key to filter by does not exist in output.'); - } - - return is_array($value) ? $value[$this->useKey] : $value->{$this->useKey}; + return new ClosureFilter($closure); } } diff --git a/src/Steps/Filters/Filterable.php b/src/Steps/Filters/Filterable.php new file mode 100644 index 0000000..39c7a53 --- /dev/null +++ b/src/Steps/Filters/Filterable.php @@ -0,0 +1,82 @@ +isOutputKeyAlias($keyOrFilter) + ) { + $keyOrFilter = $this->getOutputKeyAliasRealKey($keyOrFilter); + } + + $filter->useKey($keyOrFilter); + + $this->filters[] = $filter; + } else { + $this->filters[] = $keyOrFilter; + } + + return $this; + } + + /** + * @throws Exception + */ + public function orWhere(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static + { + if (empty($this->filters)) { + throw new Exception('No where before orWhere'); + } elseif (is_string($keyOrFilter) && $filter === null) { + throw new InvalidArgumentException('You have to provide a Filter (instance of FilterInterface)'); + } elseif (is_string($keyOrFilter)) { + $filter->useKey($keyOrFilter); + } else { + $filter = $keyOrFilter; + 
} + + $lastFilter = end($this->filters); + + $lastFilter->addOr($filter); + + return $this; + } + + protected function passesAllFilters(mixed $output): bool + { + foreach ($this->filters as $filter) { + if (!$filter->evaluate($output)) { + if ($filter->getOr()) { + $orFilter = $filter->getOr(); + + while ($orFilter) { + if ($orFilter->evaluate($output)) { + continue 2; + } + + $orFilter = $orFilter->getOr(); + } + } + + return false; + } + } + + return true; + } +} diff --git a/src/Steps/Filters/StringFilter.php b/src/Steps/Filters/StringFilter.php index cfbd31e..17f0b24 100644 --- a/src/Steps/Filters/StringFilter.php +++ b/src/Steps/Filters/StringFilter.php @@ -5,7 +5,7 @@ use Crwlr\Crawler\Steps\Filters\Enums\StringFilterRule; use Exception; -class StringFilter extends Filter +class StringFilter extends AbstractFilter { public function __construct( protected readonly StringFilterRule $filterRule, diff --git a/src/Steps/Filters/StringLengthFilter.php b/src/Steps/Filters/StringLengthFilter.php index 90695fa..16c5a67 100644 --- a/src/Steps/Filters/StringLengthFilter.php +++ b/src/Steps/Filters/StringLengthFilter.php @@ -5,7 +5,7 @@ use Crwlr\Crawler\Steps\Filters\Enums\StringLengthFilterRule; use Exception; -class StringLengthFilter extends Filter +class StringLengthFilter extends AbstractFilter { public function __construct( protected readonly StringLengthFilterRule $filterRule, diff --git a/src/Steps/Filters/UrlFilter.php b/src/Steps/Filters/UrlFilter.php index c608e4e..e97c8d3 100644 --- a/src/Steps/Filters/UrlFilter.php +++ b/src/Steps/Filters/UrlFilter.php @@ -5,7 +5,7 @@ use Crwlr\Crawler\Steps\Filters\Enums\UrlFilterRule; use Exception; -class UrlFilter extends Filter +class UrlFilter extends AbstractFilter { public function __construct(protected readonly UrlFilterRule $filterRule, protected readonly string $filterString) {} diff --git a/src/Steps/Html.php b/src/Steps/Html.php index d46f31a..9826314 100644 --- a/src/Steps/Html.php +++ b/src/Steps/Html.php @@ 
-2,8 +2,11 @@ namespace Crwlr\Crawler\Steps; +use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException; +use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; use Crwlr\Crawler\Steps\Html\CssSelector; -use Crwlr\Crawler\Steps\Html\DomQueryInterface; +use Crwlr\Crawler\Steps\Html\DomQuery; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use Crwlr\Crawler\Steps\Html\GetLink; use Crwlr\Crawler\Steps\Html\GetLinks; @@ -38,10 +41,24 @@ public static function schemaOrg(): SchemaOrg return new SchemaOrg(); } + /** + * @param mixed $input + * @return HtmlDocument + * @throws MissingZlibExtensionException + */ + protected function validateAndSanitizeInput(mixed $input): HtmlDocument + { + if ($input instanceof RespondedRequest) { + $this->baseUrl = $input->effectiveUri(); + } + + return $this->validateAndSanitizeToHtmlDocumentInstance($input); + } + /** * @throws InvalidDomQueryException */ - protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface + protected function makeDefaultDomQueryInstance(string $query): DomQuery { return new CssSelector($query); } diff --git a/src/Steps/Html/CssSelector.php b/src/Steps/Html/CssSelector.php index 65b2db2..9d687d3 100644 --- a/src/Steps/Html/CssSelector.php +++ b/src/Steps/Html/CssSelector.php @@ -2,11 +2,15 @@ namespace Crwlr\Crawler\Steps\Html; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; +use Crwlr\Crawler\Steps\Dom\Node; +use Crwlr\Crawler\Steps\Dom\NodeList; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; +use Crwlr\Utils\PhpVersion; +use DOMException; use Symfony\Component\CssSelector\CssSelectorConverter; use Symfony\Component\CssSelector\Exception\ExpressionErrorException; use Symfony\Component\CssSelector\Exception\SyntaxErrorException; -use Symfony\Component\DomCrawler\Crawler; final class CssSelector extends DomQuery { @@ -15,17 +19,25 @@ final class CssSelector extends DomQuery */ public function 
__construct(string $query) { - try { - (new CssSelectorConverter())->toXPath($query); - } catch (ExpressionErrorException|SyntaxErrorException $exception) { - throw InvalidDomQueryException::fromSymfonyException($query, $exception); + if (PhpVersion::isBelow(8, 4)) { + try { + (new CssSelectorConverter())->toXPath($query); + } catch (ExpressionErrorException|SyntaxErrorException $exception) { + throw InvalidDomQueryException::fromSymfonyException($query, $exception); + } + } else { + try { + (new HtmlDocument('

'))->querySelector($query); + } catch (DOMException $exception) { + throw InvalidDomQueryException::fromDomException($query, $exception); + } } parent::__construct($query); } - public function filter(Crawler $domCrawler): Crawler + protected function filter(Node $node): NodeList { - return $domCrawler->filter($this->query); + return $node->querySelectorAll($this->query); } } diff --git a/src/Steps/Html/DomQuery.php b/src/Steps/Html/DomQuery.php index 78417ef..be82d3f 100644 --- a/src/Steps/Html/DomQuery.php +++ b/src/Steps/Html/DomQuery.php @@ -2,14 +2,18 @@ namespace Crwlr\Crawler\Steps\Html; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; +use Crwlr\Crawler\Steps\Dom\HtmlElement; +use Crwlr\Crawler\Steps\Dom\Node; +use Crwlr\Crawler\Steps\Dom\NodeList; +use Crwlr\Crawler\Steps\Dom\XmlElement; use Crwlr\Html2Text\Exceptions\InvalidHtmlException; use Crwlr\Html2Text\Html2Text; use Crwlr\Url\Url; use Exception; use InvalidArgumentException; -use Symfony\Component\DomCrawler\Crawler; -abstract class DomQuery implements DomQueryInterface +abstract class DomQuery { public ?string $attributeName = null; @@ -37,42 +41,21 @@ public function __construct( public readonly string $query, ) {} - /** - * When there is a tag with a href attribute in an HTML document all links in the document must be resolved - * against that base url. This method finds the base href in a document if there is one. - */ - public static function getBaseHrefFromDocument(Crawler $document): ?string - { - $baseTag = $document->filter('base'); - - if ($baseTag->count() > 0) { - // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base - // "If multiple elements are used, only the first href and first target are obeyed..." 
- $href = $baseTag->first()->attr('href'); - - if (!empty($href)) { - return $href; - } - } - - return null; - } - /** * @return string[]|string|null * @throws InvalidHtmlException|Exception */ - public function apply(Crawler $domCrawler): array|string|null + public function apply(Node $node): array|string|null { - if ($this->toAbsoluteUrl) { - $baseHref = self::getBaseHrefFromDocument($domCrawler); + if ($this->toAbsoluteUrl && $node instanceof HtmlDocument) { + $baseHref = $node->getBaseHref(); if ($baseHref) { $this->setBaseUrl($baseHref); } } - $filtered = $this->filter($domCrawler); + $filtered = $this->filter($node); if ($this->filtersMatches()) { $filtered = $this->filterMatches($filtered); @@ -87,7 +70,11 @@ public function apply(Crawler $domCrawler): array|string|null return $this->getTarget($element); }); } elseif ($filtered->count() === 1) { - return $this->getTarget($filtered); + $node = $filtered->first(); + + if ($node instanceof HtmlElement || $node instanceof XmlElement) { + return $this->getTarget($node); + } } return null; @@ -157,13 +144,6 @@ public function html(): self return $this; } - public function innerText(): self - { - $this->target = SelectorTarget::InnerText; - - return $this; - } - public function attribute(string $attributeName): self { $this->target = SelectorTarget::Attribute; @@ -226,6 +206,8 @@ public function setBaseUrl(string $baseUrl): static return $this; } + abstract protected function filter(Node $node): NodeList; + protected function filtersMatches(): bool { return $this->onlyFirstMatch || @@ -235,31 +217,45 @@ protected function filtersMatches(): bool $this->onlyOddMatches; } - protected function filterMatches(Crawler $domCrawler): ?Crawler + /** + * @return NodeList|null + * @throws Exception + */ + protected function filterMatches(NodeList $matches): NodeList|null { if ( - $domCrawler->count() === 0 || - ($this->onlyNthMatch !== false && $domCrawler->count() < $this->onlyNthMatch) + $matches->count() === 0 || + 
($this->onlyNthMatch !== false && $matches->count() < $this->onlyNthMatch) ) { return null; } if ($this->onlyFirstMatch) { - return $domCrawler->first(); + $node = $matches->first(); + + return $node ? new NodeList([$node]) : new NodeList([]); } elseif ($this->onlyLastMatch) { - return $domCrawler->last(); + $node = $matches->last(); + + return $node ? new NodeList([$node]) : new NodeList([]); } elseif ($this->onlyNthMatch !== false) { - return new Crawler($domCrawler->getNode($this->onlyNthMatch - 1)); + $node = $matches->nth($this->onlyNthMatch); + + return $node ? new NodeList([$node]) : new NodeList([]); } elseif ($this->onlyEvenMatches || $this->onlyOddMatches) { - return $this->filterEvenOrOdd($domCrawler); + return $this->filterEvenOrOdd($matches); } return null; } - protected function filterEvenOrOdd(Crawler $domCrawler): Crawler + /** + * @param NodeList $domCrawler + * @return NodeList + */ + protected function filterEvenOrOdd(NodeList $domCrawler): NodeList { - $newDomCrawler = new Crawler(); + $nodes = []; $i = 1; @@ -268,32 +264,38 @@ protected function filterEvenOrOdd(Crawler $domCrawler): Crawler ($this->onlyEvenMatches && $i % 2 === 0) || ($this->onlyOddMatches && $i % 2 !== 0) ) { - $newDomCrawler->addNode($node); + $nodes[] = $node; } $i++; } - return $newDomCrawler; + return new NodeList($nodes); } /** * @throws InvalidHtmlException * @throws Exception */ - protected function getTarget(Crawler $filtered): string + protected function getTarget(HtmlElement|XmlElement $node): string { if ($this->target === SelectorTarget::FormattedText) { if (!$this->html2TextConverter) { $this->html2TextConverter = new Html2Text(); } - $target = $this->html2TextConverter->convertHtmlToText($filtered->outerHtml()); + $target = $this->html2TextConverter->convertHtmlToText( + $node instanceof HtmlElement ? $node->outerHtml() : $node->outerXml(), + ); + } elseif ($this->target === SelectorTarget::Html) { + $target = $node instanceof HtmlElement ? 
trim($node->innerHtml()) : trim($node->innerXml()); + } elseif ($this->target === SelectorTarget::OuterHtml) { + $target = $node instanceof HtmlElement ? trim($node->outerHtml()) : trim($node->outerXml()); } else { $target = trim( $this->attributeName ? - $filtered->attr($this->attributeName) : - $filtered->{strtolower($this->target->name)}(), + $node->getAttribute($this->attributeName) : + $node->{strtolower($this->target->name)}(), ); } diff --git a/src/Steps/Html/DomQueryInterface.php b/src/Steps/Html/DomQueryInterface.php deleted file mode 100644 index 035af2a..0000000 --- a/src/Steps/Html/DomQueryInterface.php +++ /dev/null @@ -1,20 +0,0 @@ -getMessage(), + $originalException->getCode(), + $originalException, + ); + + $exception->setDomQuery($domQuery); + + return $exception; + } + public function setDomQuery(string $domQuery): void { $this->query = $domQuery; diff --git a/src/Steps/Html/GetLink.php b/src/Steps/Html/GetLink.php index 55edd1c..606baa5 100644 --- a/src/Steps/Html/GetLink.php +++ b/src/Steps/Html/GetLink.php @@ -2,17 +2,18 @@ namespace Crwlr\Crawler\Steps\Html; +use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; +use Crwlr\Crawler\Steps\Dom\HtmlElement; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use Crwlr\Crawler\Steps\Loading\Http; use Crwlr\Crawler\Steps\Step; use Crwlr\Crawler\Steps\StepOutputType; use Crwlr\Url\Url; -use DOMNode; use Exception; use Generator; use InvalidArgumentException; -use Symfony\Component\DomCrawler\Crawler; class GetLink extends Step { @@ -44,9 +45,9 @@ public function __construct(null|string|CssSelector $selector = null) $this->selector = is_string($selector) ? 
new CssSelector($selector) : $selector; } - public static function isSpecialNonHttpLink(Crawler $linkElement): bool + public static function isSpecialNonHttpLink(HtmlElement $linkElement): bool { - $href = $linkElement->attr('href') ?? ''; + $href = $linkElement->getAttribute('href') ?? ''; return str_starts_with($href, 'mailto:') || str_starts_with($href, 'tel:') || @@ -58,7 +59,10 @@ public function outputType(): StepOutputType return StepOutputType::Scalar; } - protected function validateAndSanitizeInput(mixed $input): Crawler + /** + * @throws MissingZlibExtensionException + */ + protected function validateAndSanitizeInput(mixed $input): HtmlDocument { if (!$input instanceof RespondedRequest) { throw new InvalidArgumentException('Input must be an instance of RespondedRequest.'); @@ -66,11 +70,11 @@ protected function validateAndSanitizeInput(mixed $input): Crawler $this->baseUri = Url::parse($input->effectiveUri()); - return new Crawler(Http::getBodyString($input)); + return new HtmlDocument(Http::getBodyString($input)); } /** - * @param Crawler $input + * @param HtmlDocument $input * @return Generator * @throws Exception */ @@ -84,7 +88,7 @@ protected function invoke(mixed $input): Generator $selector = new CssSelector($selector); } - foreach ($selector->filter($input) as $link) { + foreach ($input->querySelectorAll($selector->query) as $link) { $linkUrl = $this->getLinkUrl($link); if ($linkUrl) { @@ -166,9 +170,9 @@ public function withoutFragment(): static /** * @throws Exception */ - protected function getBaseFromDocument(Crawler $document): void + protected function getBaseFromDocument(HtmlDocument $document): void { - $baseHref = DomQuery::getBaseHrefFromDocument($document); + $baseHref = $document->getBaseHref(); if (!empty($baseHref)) { $this->baseUri = $this->baseUri->resolve($baseHref); @@ -178,22 +182,20 @@ protected function getBaseFromDocument(Crawler $document): void /** * @throws Exception */ - protected function getLinkUrl(DOMNode $link): ?Url + 
protected function getLinkUrl(HtmlElement $link): ?Url { - if ($link->nodeName !== 'a') { - $this->logger?->warning('Selector matched <' . $link->nodeName . '> html element. Ignored it.'); + if ($link->nodeName() !== 'a') { + $this->logger?->warning('Selector matched <' . $link->nodeName() . '> html element. Ignored it.'); return null; } - $link = new Crawler($link); - if (self::isSpecialNonHttpLink($link)) { return null; } $linkUrl = $this->handleUrlFragment( - $this->baseUri->resolve($link->attr('href') ?? ''), + $this->baseUri->resolve($link->getAttribute('href') ?? ''), ); if ($this->matchesAdditionalCriteria($linkUrl)) { diff --git a/src/Steps/Html/GetLinks.php b/src/Steps/Html/GetLinks.php index 04ac2d9..5a0cf42 100644 --- a/src/Steps/Html/GetLinks.php +++ b/src/Steps/Html/GetLinks.php @@ -2,14 +2,14 @@ namespace Crwlr\Crawler\Steps\Html; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; use Exception; use Generator; -use Symfony\Component\DomCrawler\Crawler; class GetLinks extends GetLink { /** - * @param Crawler $input + * @param HtmlDocument $input * @return Generator * @throws Exception */ @@ -23,7 +23,7 @@ protected function invoke(mixed $input): Generator $selector = new CssSelector($selector); } - foreach ($selector->filter($input) as $link) { + foreach ($input->querySelectorAll($selector->query) as $link) { $linkUrl = $this->getLinkUrl($link); if ($linkUrl) { diff --git a/src/Steps/Html/MetaData.php b/src/Steps/Html/MetaData.php index 02ad146..d1b9392 100644 --- a/src/Steps/Html/MetaData.php +++ b/src/Steps/Html/MetaData.php @@ -2,11 +2,11 @@ namespace Crwlr\Crawler\Steps\Html; +use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; use Crwlr\Crawler\Steps\Step; use Crwlr\Crawler\Steps\StepOutputType; -use DOMElement; use Generator; -use Symfony\Component\DomCrawler\Crawler; class MetaData extends Step { @@ -31,14 +31,13 @@ public function outputType(): StepOutputType } /** - * @param Crawler $input + * 
@param HtmlDocument $input */ protected function invoke(mixed $input): Generator { $data = $this->addToData([], 'title', $this->getTitle($input)); - foreach ($input->filter('meta') as $metaElement) { - /** @var DOMElement $metaElement */ + foreach ($input->querySelectorAll('meta') as $metaElement) { $metaName = $metaElement->getAttribute('name'); if (empty($metaName)) { @@ -46,24 +45,27 @@ protected function invoke(mixed $input): Generator } if (!empty($metaName) && (empty($this->onlyKeys) || in_array($metaName, $this->onlyKeys, true))) { - $data = $this->addToData($data, $metaName, $metaElement->getAttribute('content')); + $data = $this->addToData($data, $metaName, $metaElement->getAttribute('content') ?? ''); } } yield $data; } + /** + * @throws MissingZlibExtensionException + */ protected function validateAndSanitizeInput(mixed $input): mixed { - return $this->validateAndSanitizeToDomCrawlerInstance($input); + return $this->validateAndSanitizeToHtmlDocumentInstance($input); } - protected function getTitle(Crawler $document): string + protected function getTitle(HtmlDocument $document): string { - $titleElement = $document->filter('title'); + $titleElement = $document->querySelector('title'); - if ($titleElement->count() > 0) { - return $titleElement->first()->text(); + if ($titleElement) { + return $titleElement->text(); } return ''; diff --git a/src/Steps/Html/SchemaOrg.php b/src/Steps/Html/SchemaOrg.php index 34a4a5b..313d773 100644 --- a/src/Steps/Html/SchemaOrg.php +++ b/src/Steps/Html/SchemaOrg.php @@ -3,6 +3,7 @@ namespace Crwlr\Crawler\Steps\Html; use Adbar\Dot; +use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException; use Crwlr\Crawler\Steps\Step; use Crwlr\Crawler\Steps\StepOutputType; use Generator; @@ -48,6 +49,9 @@ public function outputType(): StepOutputType return StepOutputType::AssociativeArrayOrObject; } + /** + * @param string $input + */ protected function invoke(mixed $input): Generator { $data = 
\Crwlr\SchemaOrg\SchemaOrg::fromHtml($input, $this->logger); @@ -63,6 +67,9 @@ protected function invoke(mixed $input): Generator } } + /** + * @throws MissingZlibExtensionException + */ protected function validateAndSanitizeInput(mixed $input): string { return $this->validateAndSanitizeStringOrHttpResponse($input); diff --git a/src/Steps/Html/SelectorTarget.php b/src/Steps/Html/SelectorTarget.php index 289794e..1e5626a 100644 --- a/src/Steps/Html/SelectorTarget.php +++ b/src/Steps/Html/SelectorTarget.php @@ -5,9 +5,12 @@ enum SelectorTarget { case Text; + case FormattedText; + case Html; - case InnerText; + case Attribute; + case OuterHtml; } diff --git a/src/Steps/Html/XPathQuery.php b/src/Steps/Html/XPathQuery.php index e00c0f8..d8884d2 100644 --- a/src/Steps/Html/XPathQuery.php +++ b/src/Steps/Html/XPathQuery.php @@ -2,10 +2,16 @@ namespace Crwlr\Crawler\Steps\Html; +use Crwlr\Crawler\Steps\Dom\Node; +use Crwlr\Crawler\Steps\Dom\NodeList; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use DOMDocument; use DOMXPath; -use Symfony\Component\DomCrawler\Crawler; + +/** + * @deprecated As the usage of XPath queries is no longer an option with the new DOM API introduced in + * PHP 8.4, please switch to using CSS selectors instead! 
+ */ class XPathQuery extends DomQuery { @@ -19,9 +25,9 @@ public function __construct(string $query) parent::__construct($query); } - public function filter(Crawler $domCrawler): Crawler + protected function filter(Node $node): NodeList { - return $domCrawler->filterXPath($this->query); + return $node->queryXPath($this->query); } /** diff --git a/src/Steps/Json.php b/src/Steps/Json.php index 6d9fe46..f6d685f 100644 --- a/src/Steps/Json.php +++ b/src/Steps/Json.php @@ -3,10 +3,10 @@ namespace Crwlr\Crawler\Steps; use Adbar\Dot; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; use Crwlr\Utils\Json as JsonUtil; use Crwlr\Utils\Exceptions\InvalidJsonException; use Generator; -use Symfony\Component\DomCrawler\Crawler; use Throwable; class Json extends Step @@ -89,7 +89,7 @@ protected function inputStringToArray(string $input): ?array // If headless browser is used in loader, the JSON in the response body is wrapped in an HTML document. if (str_contains($input, 'filter('body')->text(); + $bodyText = (new HtmlDocument($input))->querySelector('body')?->text() ?? 
''; return JsonUtil::stringToArray($bodyText); } catch (Throwable) { diff --git a/src/Steps/Loading/Http/Document.php b/src/Steps/Loading/Http/Document.php index 59cd719..28df312 100644 --- a/src/Steps/Loading/Http/Document.php +++ b/src/Steps/Loading/Http/Document.php @@ -3,16 +3,15 @@ namespace Crwlr\Crawler\Steps\Loading\Http; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; -use Crwlr\Crawler\Steps\Html\DomQuery; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; use Crwlr\Crawler\Steps\Loading\Http; use Crwlr\Url\Url; use Exception; use Psr\Log\LoggerInterface; -use Symfony\Component\DomCrawler\Crawler; final class Document { - private Crawler $dom; + private HtmlDocument $dom; private Url $url; @@ -26,12 +25,12 @@ public function __construct( ) { $responseBody = Http::getBodyString($this->respondedRequest); - $this->dom = new Crawler($responseBody); + $this->dom = new HtmlDocument($responseBody); $this->setBaseUrl(); } - public function dom(): Crawler + public function dom(): HtmlDocument { return $this->dom; } @@ -49,10 +48,10 @@ public function baseUrl(): Url public function canonicalUrl(): string { if ($this->canonicalUrl === null) { - $canonicalLinkElement = $this->dom->filter('link[rel=canonical]')->first(); + $canonicalLinkElement = $this->dom->querySelector('link[rel=canonical]'); - if ($canonicalLinkElement->count() > 0) { - $canonicalHref = $canonicalLinkElement->first()->attr('href'); + if ($canonicalLinkElement) { + $canonicalHref = $canonicalLinkElement->getAttribute('href'); if ($canonicalHref) { try { @@ -77,7 +76,7 @@ private function setBaseUrl(): void $this->baseUrl = $this->url; - $documentBaseHref = DomQuery::getBaseHrefFromDocument($this->dom); + $documentBaseHref = $this->dom->getBaseHref(); if ($documentBaseHref) { try { diff --git a/src/Steps/Loading/Http/Paginator.php b/src/Steps/Loading/Http/Paginator.php index abfd65a..41630c7 100644 --- a/src/Steps/Loading/Http/Paginator.php +++ b/src/Steps/Loading/Http/Paginator.php @@ -2,7 +2,7 @@ 
namespace Crwlr\Crawler\Steps\Loading\Http; -use Crwlr\Crawler\Steps\Html\DomQueryInterface; +use Crwlr\Crawler\Steps\Html\DomQuery; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParamsPaginator; use Crwlr\Crawler\Steps\Loading\Http\Paginators\SimpleWebsitePaginator; @@ -15,7 +15,7 @@ class Paginator * @throws InvalidDomQueryException */ public static function simpleWebsite( - string|DomQueryInterface $paginationLinksSelector, + string|DomQuery $paginationLinksSelector, int $maxPages = self::MAX_PAGES_DEFAULT, ): SimpleWebsitePaginator { return new SimpleWebsitePaginator($paginationLinksSelector, $maxPages); diff --git a/src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php b/src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php index 614ca4c..3e93b5b 100644 --- a/src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php +++ b/src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php @@ -4,8 +4,8 @@ use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; use Crwlr\Crawler\Steps\Dom; +use Crwlr\Crawler\Steps\Html\CssSelector; use Crwlr\Crawler\Steps\Html\DomQuery; -use Crwlr\Crawler\Steps\Html\DomQueryInterface; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use Crwlr\Crawler\Steps\Loading\Http; use Crwlr\Crawler\Utils\RequestKey; @@ -13,7 +13,6 @@ use Exception; use Psr\Http\Message\RequestInterface; use Psr\Log\LoggerInterface; -use Symfony\Component\DomCrawler\Crawler; class SimpleWebsitePaginator extends Http\AbstractPaginator { @@ -27,7 +26,7 @@ class SimpleWebsitePaginator extends Http\AbstractPaginator */ protected array $loadedUrls = []; - protected DomQueryInterface $paginationLinksSelector; + protected DomQuery $paginationLinksSelector; protected string $latestRequestKey = ''; @@ -39,7 +38,7 @@ class SimpleWebsitePaginator extends Http\AbstractPaginator /** * @throws InvalidDomQueryException */ - public function 
__construct(string|DomQueryInterface $paginationLinksSelector, int $maxPages = 1000) + public function __construct(string|DomQuery $paginationLinksSelector, int $maxPages = 1000) { if (is_string($paginationLinksSelector)) { $this->paginationLinksSelector = Dom::cssSelector($paginationLinksSelector); @@ -114,25 +113,24 @@ protected function getPaginationLinksFromResponse(RespondedRequest $respondedReq { $responseBody = Http::getBodyString($respondedRequest); - $dom = new Crawler($responseBody); + $document = new Dom\HtmlDocument($responseBody); - $paginationLinksElements = $this->paginationLinksSelector->filter($dom); + $paginationLinksElements = $this->paginationLinksSelector instanceof CssSelector ? + $document->querySelectorAll($this->paginationLinksSelector->query) : + $document->queryXPath($this->paginationLinksSelector->query); foreach ($paginationLinksElements as $paginationLinksElement) { - $paginationLinksElement = new Crawler($paginationLinksElement); - + /** @var Dom\HtmlElement $paginationLinksElement */ $this->addFoundUrlFromLinkElement( $paginationLinksElement, - $dom, + $document, $respondedRequest->effectiveUri(), ); - foreach ($paginationLinksElement->filter('a') as $linkInPaginationLinksElement) { - $linkInPaginationLinksElement = new Crawler($linkInPaginationLinksElement); - + foreach ($paginationLinksElement->querySelectorAll('a') as $linkInPaginationLinksElement) { $this->addFoundUrlFromLinkElement( $linkInPaginationLinksElement, - $dom, + $document, $respondedRequest->effectiveUri(), ); } @@ -143,8 +141,8 @@ protected function getPaginationLinksFromResponse(RespondedRequest $respondedReq * @throws Exception */ protected function addFoundUrlFromLinkElement( - Crawler $linkElement, - Crawler $document, + Dom\HtmlElement $linkElement, + Dom\HtmlDocument $document, string $documentUrl, ): void { if ($this->isRelevantLinkElement($linkElement)) { @@ -158,30 +156,30 @@ protected function addFoundUrlFromLinkElement( * @throws Exception */ protected 
function getAbsoluteUrlFromLinkElement( - Crawler $linkElement, - Crawler $document, + Dom\HtmlElement $linkElement, + Dom\HtmlDocument $document, string $documentUrl, ): string { $baseUrl = Url::parse($documentUrl); - $baseHref = DomQuery::getBaseHrefFromDocument($document); + $baseHref = $document->getBaseHref(); if ($baseHref) { $baseUrl = $baseUrl->resolve($baseHref); } - $linkHref = $linkElement->attr('href') ?? ''; + $linkHref = $linkElement->getAttribute('href') ?? ''; return $baseUrl->resolve($linkHref)->__toString(); } - protected function isRelevantLinkElement(Crawler $element): bool + protected function isRelevantLinkElement(Dom\HtmlElement $element): bool { if ($element->nodeName() !== 'a') { return false; } - $href = $element->attr('href'); + $href = $element->getAttribute('href'); return !empty($href) && !str_starts_with($href, '#'); } diff --git a/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInDom.php b/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInDom.php index 7ef2492..d14ed82 100644 --- a/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInDom.php +++ b/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInDom.php @@ -2,20 +2,24 @@ namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules; +use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; +use Crwlr\Crawler\Steps\Dom\DomDocument; +use Crwlr\Crawler\Steps\Dom\HtmlElement; +use Crwlr\Crawler\Steps\Dom\XmlElement; use Crwlr\Crawler\Steps\Html\CssSelector; -use Crwlr\Crawler\Steps\Html\DomQueryInterface; +use Crwlr\Crawler\Steps\Html\DomQuery; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use Crwlr\Crawler\Steps\Loading\Http; use Psr\Http\Message\RequestInterface; -use Symfony\Component\DomCrawler\Crawler; +use Throwable; abstract class IsEmptyInDom implements StopRule { - public function __construct(protected string|DomQueryInterface $selector) {} + public function __construct(protected 
string|DomQuery $selector) {} /** - * @throws InvalidDomQueryException + * @throws InvalidDomQueryException|MissingZlibExtensionException */ public function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool { @@ -23,24 +27,38 @@ public function shouldStop(RequestInterface $request, ?RespondedRequest $respond return true; } - $content = trim(Http::getBodyString($respondedRequest->response)); + $source = trim(Http::getBodyString($respondedRequest->response)); - $dom = new Crawler($content); + try { + $document = $this->makeDom($source); + } catch (Throwable $exception) { + return true; + } - $domQuery = $this->selector instanceof DomQueryInterface ? $this->selector : new CssSelector($this->selector); + $domQuery = $this->selector instanceof DomQuery ? $this->selector : new CssSelector($this->selector); - $filtered = $domQuery->filter($dom); + $filtered = $domQuery instanceof CssSelector ? + $document->querySelectorAll($domQuery->query) : + $document->queryXPath($domQuery->query); if ($filtered->count() === 0) { return true; } foreach ($filtered as $element) { - if (trim((new Crawler($element))->html()) !== '') { + /** @var HtmlElement|XmlElement $element */ + if (!$this->nodeIsEmpty($element)) { return false; } } return true; } + + abstract protected function makeDom(string $source): DomDocument; + + private function nodeIsEmpty(HtmlElement|XmlElement $node): bool + { + return $node instanceof HtmlElement ? 
trim($node->innerHtml()) === '' : trim($node->innerXml()) === ''; + } } diff --git a/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtml.php b/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtml.php index 58433ee..a017386 100644 --- a/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtml.php +++ b/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtml.php @@ -2,4 +2,13 @@ namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules; -class IsEmptyInHtml extends IsEmptyInDom {} +use Crwlr\Crawler\Steps\Dom\DomDocument; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; + +class IsEmptyInHtml extends IsEmptyInDom +{ + protected function makeDom(string $source): DomDocument + { + return new HtmlDocument($source); + } +} diff --git a/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXml.php b/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXml.php index 82a2bdd..8341937 100644 --- a/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXml.php +++ b/src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXml.php @@ -2,4 +2,13 @@ namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules; -class IsEmptyInXml extends IsEmptyInDom {} +use Crwlr\Crawler\Steps\Dom\DomDocument; +use Crwlr\Crawler\Steps\Dom\XmlDocument; + +class IsEmptyInXml extends IsEmptyInDom +{ + protected function makeDom(string $source): DomDocument + { + return new XmlDocument($source); + } +} diff --git a/src/Steps/Loading/Http/Paginators/StopRules/PaginatorStopRules.php b/src/Steps/Loading/Http/Paginators/StopRules/PaginatorStopRules.php index e4d2d4d..b1bcb8b 100644 --- a/src/Steps/Loading/Http/Paginators/StopRules/PaginatorStopRules.php +++ b/src/Steps/Loading/Http/Paginators/StopRules/PaginatorStopRules.php @@ -2,7 +2,7 @@ namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules; -use Crwlr\Crawler\Steps\Html\DomQueryInterface; +use Crwlr\Crawler\Steps\Html\DomQuery; class PaginatorStopRules { @@ -16,12 +16,12 @@ public static function isEmptyInJson(string 
$dotNotationKey): IsEmptyInJson return new IsEmptyInJson($dotNotationKey); } - public static function isEmptyInHtml(string|DomQueryInterface $selector): IsEmptyInHtml + public static function isEmptyInHtml(string|DomQuery $selector): IsEmptyInHtml { return new IsEmptyInHtml($selector); } - public static function isEmptyInXml(string|DomQueryInterface $selector): IsEmptyInXml + public static function isEmptyInXml(string|DomQuery $selector): IsEmptyInXml { return new IsEmptyInXml($selector); } diff --git a/src/Steps/Loading/HttpCrawl.php b/src/Steps/Loading/HttpCrawl.php index bfd9280..ba28d3e 100644 --- a/src/Steps/Loading/HttpCrawl.php +++ b/src/Steps/Loading/HttpCrawl.php @@ -4,14 +4,16 @@ use Closure; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; +use Crwlr\Crawler\Steps\Dom\HtmlElement; +use Crwlr\Crawler\Steps\Dom\XmlDocument; use Crwlr\Crawler\Steps\Html\GetLink; use Crwlr\Crawler\Steps\Loading\Http\Document; use Crwlr\Crawler\Steps\Sitemap\GetUrlsFromSitemap; +use Crwlr\Utils\PhpVersion; use Crwlr\Url\Url; use Exception; use Generator; use Psr\Http\Message\UriInterface; -use Symfony\Component\DomCrawler\Crawler; use Throwable; class HttpCrawl extends Http @@ -254,12 +256,16 @@ protected function getUrlsFromInitialResponse(RespondedRequest $respondedRequest */ protected function getUrlsFromSitemap(RespondedRequest $respondedRequest): array { - $domCrawler = GetUrlsFromSitemap::fixUrlSetTag(new Crawler(Http::getBodyString($respondedRequest))); + $document = new XmlDocument(Http::getBodyString($respondedRequest)); + + if (PhpVersion::isBelow(8, 4)) { + $document = GetUrlsFromSitemap::fixUrlSetTag($document); + } $urls = []; - foreach ($domCrawler->filter('urlset url loc') as $url) { - $url = $this->handleUrlFragment(Url::parse($url->textContent)); + foreach ($document->querySelectorAll('urlset url loc') as $url) { + $url = $this->handleUrlFragment(Url::parse($url->text())); if (!$this->isOnSameHostOrDomain($url)) { continue; @@ -291,17 +297,15 @@ 
protected function getUrlsFromHtmlDocument(Document $document): array $urls = []; - foreach ($document->dom()->filter('a') as $link) { - $linkElement = new Crawler($link); - - if (GetLink::isSpecialNonHttpLink($linkElement)) { + foreach ($document->dom()->querySelectorAll('a') as $link) { + if (GetLink::isSpecialNonHttpLink($link)) { continue; } try { - $url = $this->handleUrlFragment($document->baseUrl()->resolve($linkElement->attr('href') ?? '')); + $url = $this->handleUrlFragment($document->baseUrl()->resolve($link->getAttribute('href') ?? '')); } catch (Throwable) { - $this->logger?->warning('Failed to resolve a link with href: ' . $linkElement->attr('href')); + $this->logger?->warning('Failed to resolve a link with href: ' . $link->getAttribute('href')); continue; } @@ -310,7 +314,7 @@ protected function getUrlsFromHtmlDocument(Document $document): array continue; } - $matchesCriteria = $this->matchesCriteriaBesidesHostOrDomain($url, $linkElement); + $matchesCriteria = $this->matchesCriteriaBesidesHostOrDomain($url, $link); if (!$matchesCriteria && !$this->loadAll) { continue; @@ -410,7 +414,7 @@ protected function depthIsExceeded(int $depth): bool /** * @throws Exception */ - protected function matchesAllCriteria(Url $url, ?Crawler $linkElement = null): bool + protected function matchesAllCriteria(Url $url, ?HtmlElement $linkElement = null): bool { return $this->isOnSameHostOrDomain($url) && $this->matchesCriteriaBesidesHostOrDomain($url, $linkElement); } @@ -418,7 +422,7 @@ protected function matchesAllCriteria(Url $url, ?Crawler $linkElement = null): b /** * @throws Exception */ - protected function matchesCriteriaBesidesHostOrDomain(Url $url, ?Crawler $linkElement = null): bool + protected function matchesCriteriaBesidesHostOrDomain(Url $url, ?HtmlElement $linkElement = null): bool { return $this->matchesPathCriteria($url) && $this->matchesCustomCriteria($url, $linkElement); @@ -451,7 +455,7 @@ protected function matchesPathCriteria(Url $url): bool 
($this->pathRegex === null || preg_match($this->pathRegex, $path) === 1); } - protected function matchesCustomCriteria(Url $url, ?Crawler $linkElement): bool + protected function matchesCustomCriteria(Url $url, ?HtmlElement $linkElement): bool { return $this->customClosure === null || $this->customClosure->call($this, $url, $linkElement); } diff --git a/src/Steps/Sitemap/GetUrlsFromSitemap.php b/src/Steps/Sitemap/GetUrlsFromSitemap.php index 773c4e3..c3c9211 100644 --- a/src/Steps/Sitemap/GetUrlsFromSitemap.php +++ b/src/Steps/Sitemap/GetUrlsFromSitemap.php @@ -2,10 +2,13 @@ namespace Crwlr\Crawler\Steps\Sitemap; +use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException; +use Crwlr\Crawler\Steps\Dom\XmlDocument; +use Crwlr\Crawler\Steps\Dom\XmlElement; use Crwlr\Crawler\Steps\Step; use Crwlr\Crawler\Steps\StepOutputType; +use Crwlr\Utils\PhpVersion; use Generator; -use Symfony\Component\DomCrawler\Crawler; class GetUrlsFromSitemap extends Step { @@ -17,10 +20,10 @@ class GetUrlsFromSitemap extends Step * Symfony's DomCrawler component has problems when a sitemap's tag contains certain attributes. * So, if the count of urls in the sitemap is zero, try to remove all attributes from the tag. */ - public static function fixUrlSetTag(Crawler $dom): Crawler + public static function fixUrlSetTag(XmlDocument $dom): XmlDocument { - if ($dom->filter('urlset url')->count() === 0) { - return new Crawler(preg_replace('//', '', $dom->outerHtml())); + if ($dom->querySelectorAll('urlset url')->count() === 0) { + return new XmlDocument(preg_replace('//', '', $dom->outerXml()) ?? 
$dom->outerXml()); } return $dom; @@ -39,43 +42,46 @@ public function outputType(): StepOutputType } /** - * @param Crawler $input + * @param XmlDocument $input */ protected function invoke(mixed $input): Generator { - $input = self::fixUrlSetTag($input); - - foreach ($input->filter('urlset url') as $urlNode) { - $urlNode = new Crawler($urlNode); + if (PhpVersion::isBelow(8, 4)) { + $input = self::fixUrlSetTag($input); + } - if ($urlNode->children('loc')->first()->count() > 0) { + foreach ($input->querySelectorAll('urlset url') as $urlNode) { + if ($urlNode->querySelector('loc')) { if ($this->withData) { yield $this->getWithAdditionalData($urlNode); } else { - yield $urlNode->children('loc')->first()->text(); + yield $urlNode->querySelector('loc')->text(); } } } } + /** + * @throws MissingZlibExtensionException + */ protected function validateAndSanitizeInput(mixed $input): mixed { - return $this->validateAndSanitizeToDomCrawlerInstance($input); + return $this->validateAndSanitizeToXmlDocumentInstance($input); } /** * @return string[] */ - protected function getWithAdditionalData(Crawler $urlNode): array + protected function getWithAdditionalData(XmlElement $urlNode): array { - $data = ['url' => $urlNode->children('loc')->first()->text()]; + $data = ['url' => $urlNode->querySelector('loc')?->text() ?? 
'']; $properties = ['lastmod', 'changefreq', 'priority']; foreach ($properties as $property) { - $node = $urlNode->children($property)->first(); + $node = $urlNode->querySelector($property); - if ($node->count() > 0) { + if ($node) { $data[$property] = $node->text(); } } diff --git a/src/Steps/Step.php b/src/Steps/Step.php index 414470e..917f74f 100644 --- a/src/Steps/Step.php +++ b/src/Steps/Step.php @@ -7,6 +7,8 @@ use Crwlr\Crawler\Input; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; use Crwlr\Crawler\Output; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; +use Crwlr\Crawler\Steps\Dom\XmlDocument; use Crwlr\Crawler\Steps\Loading\Http; use Crwlr\Url\Url; use Exception; @@ -14,7 +16,6 @@ use InvalidArgumentException; use Psr\Http\Message\ResponseInterface; use Psr\Http\Message\UriInterface; -use Symfony\Component\DomCrawler\Crawler; abstract class Step extends BaseStep { @@ -175,11 +176,21 @@ protected function validateAndSanitizeToUriInterface( /** * @throws MissingZlibExtensionException */ - protected function validateAndSanitizeToDomCrawlerInstance( + protected function validateAndSanitizeToHtmlDocumentInstance( mixed $inputValue, string $exceptionMessage = 'Input must be string, stringable or HTTP response (RespondedRequest)', - ): Crawler { - return new Crawler($this->validateAndSanitizeStringOrHttpResponse($inputValue, $exceptionMessage)); + ): HtmlDocument { + return new HtmlDocument($this->validateAndSanitizeStringOrHttpResponse($inputValue, $exceptionMessage)); + } + + /** + * @throws MissingZlibExtensionException + */ + protected function validateAndSanitizeToXmlDocumentInstance( + mixed $inputValue, + string $exceptionMessage = 'Input must be string, stringable or HTTP response (RespondedRequest)', + ): XmlDocument { + return new XmlDocument($this->validateAndSanitizeStringOrHttpResponse($inputValue, $exceptionMessage)); } protected function getSingleElementFromArray(mixed $inputValue): mixed diff --git a/src/Steps/Xml.php b/src/Steps/Xml.php 
index b08dda6..40b3a2f 100644 --- a/src/Steps/Xml.php +++ b/src/Steps/Xml.php @@ -2,17 +2,34 @@ namespace Crwlr\Crawler\Steps; -use Crwlr\Crawler\Steps\Html\DomQueryInterface; +use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException; +use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; +use Crwlr\Crawler\Steps\Dom\XmlDocument; +use Crwlr\Crawler\Steps\Html\CssSelector; +use Crwlr\Crawler\Steps\Html\DomQuery; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; -use Crwlr\Crawler\Steps\Html\XPathQuery; class Xml extends Dom { /** * @throws InvalidDomQueryException */ - public function makeDefaultDomQueryInstance(string $query): DomQueryInterface + public function makeDefaultDomQueryInstance(string $query): DomQuery { - return new XPathQuery($query); + return new CssSelector($query); + } + + /** + * @param mixed $input + * @return XmlDocument + * @throws MissingZlibExtensionException + */ + protected function validateAndSanitizeInput(mixed $input): XmlDocument + { + if ($input instanceof RespondedRequest) { + $this->baseUrl = $input->effectiveUri(); + } + + return $this->validateAndSanitizeToXmlDocumentInstance($input); } } diff --git a/tests/Steps/Dom/HtmlDocumentTest.php b/tests/Steps/Dom/HtmlDocumentTest.php new file mode 100644 index 0000000..cb8a839 --- /dev/null +++ b/tests/Steps/Dom/HtmlDocumentTest.php @@ -0,0 +1,88 @@ +foohello'; + + $document = new HtmlDocument($html); + + expect($document->getBaseHref())->toBe('/foo/bar'); +}); + +it('gets the href of the first base tag in the document', function () { + $html = << + + foo + + + + hey + + HTML; + + $document = new HtmlDocument($html); + + expect($document->getBaseHref())->toBe('/foo'); +}); + +test('getBaseHref() returns null if the document does not contain a base tag', function () { + $html = 'foohey'; + + $document = new HtmlDocument($html); + + expect($document->getBaseHref())->toBeNull(); +}); + +test('the querySelector() method returns an HtmlElement object', function () { + 
$html = 'foo
hello
'; + + $document = new HtmlDocument($html); + + expect($document->querySelector('.element'))->toBeInstanceOf(HtmlElement::class); +}); + +test('the querySelectorAll() method returns a NodeList of HtmlElement objects', function () { + $html = 'foo
  • foo
  • bar
'; + + $document = new HtmlDocument($html); + + $nodeList = $document->querySelectorAll('ul li'); + + expect($nodeList)->toBeInstanceOf(NodeList::class); + + $anyNodesChecked = false; + + foreach ($nodeList as $node) { + expect($node)->toBeInstanceOf(HtmlElement::class); + + $anyNodesChecked = true; + } + + expect($anyNodesChecked)->toBeTrue(); +}); + +test('the queryXPath() method returns a NodeList of HtmlElement objects', function () { + $html = 'foo
  • foo
  • bar
'; + + $document = new HtmlDocument($html); + + $nodeList = $document->queryXPath('//ul/li'); + + expect($nodeList)->toBeInstanceOf(NodeList::class); + + $anyNodesChecked = false; + + foreach ($nodeList as $node) { + expect($node)->toBeInstanceOf(HtmlElement::class); + + $anyNodesChecked = true; + } + + expect($anyNodesChecked)->toBeTrue(); +}); diff --git a/tests/Steps/Dom/HtmlElementTest.php b/tests/Steps/Dom/HtmlElementTest.php new file mode 100644 index 0000000..2b1c625 --- /dev/null +++ b/tests/Steps/Dom/HtmlElementTest.php @@ -0,0 +1,180 @@ + + + +
+ + + HTML; + + $document = new HtmlDocument($html); + + $wrapperElement = $document->querySelector('#wrapper'); + + expect($wrapperElement)->toBeInstanceOf(HtmlElement::class) + ->and($wrapperElement?->querySelector('.element'))->toBeInstanceOf(HtmlElement::class); +}); + +test('child nodes selected via querySelectorAll() are HtmlElement instances', function () { + $html = << + + +
+
foo
+
bar
+
+ + + HTML; + + $document = new HtmlDocument($html); + + $wrapperElement = $document->querySelector('#wrapper'); + + expect($wrapperElement)->toBeInstanceOf(HtmlElement::class); + + $childNodeList = $wrapperElement?->querySelectorAll('.element'); + + expect($childNodeList)->toBeInstanceOf(NodeList::class) + ->and($childNodeList?->count())->toBe(2) + ->and($childNodeList?->first())->toBeInstanceOf(HtmlElement::class) + ->and($childNodeList?->last())->toBeInstanceOf(HtmlElement::class); +}); + +test('child nodes selected via queryXPath() are HtmlElement instances', function () { + $html = << + + +
+
foo
+
bar
+
+ + + HTML; + + $document = new HtmlDocument($html); + + $wrapperElement = $document->queryXPath('//*[@id="wrapper"]')->first(); + + expect($wrapperElement)->toBeInstanceOf(HtmlElement::class); + + $childNodeList = $wrapperElement?->queryXPath('//*[contains(@class, "element")]'); + + expect($childNodeList)->toBeInstanceOf(NodeList::class) + ->and($childNodeList?->count())->toBe(2) + ->and($childNodeList?->first())->toBeInstanceOf(HtmlElement::class) + ->and($childNodeList?->first()?->text())->toBe('foo') + ->and($childNodeList?->last())->toBeInstanceOf(HtmlElement::class) + ->and($childNodeList?->last()?->text())->toBe('bar'); +}); + +it('gets the node name', function () { + $html = << + + +
+ + + HTML; + + $document = new HtmlDocument($html); + + $node = $document->querySelector('.element'); + + expect($node?->nodeName())->toBe('div') + ->and($node?->querySelector('.child')?->nodeName())->toBe('span'); +}); + +it('gets the text of a node', function () { + $html = << + + +
+ bli bla blub +
+ + + HTML; + + $document = new HtmlDocument($html); + + $node = $document->querySelector('.element'); + + expect($node?->text())->toBe('bli bla blub'); +}); + +it('gets the outer HTML of a node', function () { + $html = << + + +
+ bli bla blub +
+ + + HTML; + + $document = new HtmlDocument($html); + + $node = $document->querySelector('.element'); + + expect($node?->outerHtml())->toBe( + '
' . PHP_EOL . + ' bli bla blub' . PHP_EOL . + '
', + ); +}); + +it('gets the inner HTML of a node', function () { + $html = << + + +
+ bli bla blub +
+ + + HTML; + + $document = new HtmlDocument($html); + + $node = $document->querySelector('.element'); + + expect($node?->innerHtml())->toBe( + PHP_EOL . + ' bli bla blub' . PHP_EOL, + ); +}); + +it('gets an attribute from a node', function () { + $html = << + + + Link + + + HTML; + + $document = new HtmlDocument($html); + + $node = $document->querySelector('.element'); + + expect($node?->getAttribute('href'))->toBe('/foo/bar'); +}); diff --git a/tests/Steps/Dom/NodeListTest.php b/tests/Steps/Dom/NodeListTest.php new file mode 100644 index 0000000..2ee4441 --- /dev/null +++ b/tests/Steps/Dom/NodeListTest.php @@ -0,0 +1,220 @@ + + + +
  • foo
  • bar
  • baz
+ + + HTML; + + $crawler = new Crawler($html); + + $filtered = $crawler->filter('ul li'); + + $nodeList = new NodeList( + $filtered, + function (object $node): HtmlElement { + /** @var \Dom\Node|DOMNode|Crawler $node */ + return new HtmlElement($node); + }, + ); + + expect($nodeList->count())->toBe(3) + ->and($nodeList->first()?->text())->toBe('foo') + ->and($nodeList->nth(2)?->text())->toBe('bar') + ->and($nodeList->last()?->text())->toBe('baz') + ->and($nodeList->each(fn($node) => $node->text()))->toBe(['foo', 'bar', 'baz']); +}); + +it('can be constructed from a \Dom\NodeList instance', function () { + $html = << + + +
  • foo
  • bar
  • baz
+ + + HTML; + + $document = \Dom\HTMLDocument::createFromString($html, LIBXML_NOERROR); + + $nodeList = new NodeList( + $document->querySelectorAll('ul li'), + function (object $node): HtmlElement { + /** @var \Dom\Node|DOMNode|Crawler $node */ + return new HtmlElement($node); + }, + ); + + expect($nodeList->count())->toBe(3) + ->and($nodeList->first()?->text())->toBe('foo') + ->and($nodeList->nth(2)?->text())->toBe('bar') + ->and($nodeList->last()?->text())->toBe('baz') + ->and($nodeList->each(fn($node) => $node->text()))->toBe(['foo', 'bar', 'baz']); +})->group('php84'); + +it('can be instantiated from an array of Nodes (object instances from this library)', function () { + $html = << + + +
+
foo
bar
baz
+
+ + + HTML; + + $document = new HtmlDocument($html); + + $array = []; + + foreach ($document->querySelectorAll('.list .element') as $node) { + $array[] = $node; + } + + $newNodeList = new NodeList($array); + + expect($newNodeList->count())->toBe(3) + ->and($newNodeList->first()?->text())->toBe('foo') + ->and($newNodeList->last()?->text())->toBe('baz') + ->and($newNodeList->nth(2)?->text())->toBe('bar'); +}); + +it('gets the count of the node list', function () { + $html = << + + Foo + + +
  • foo
  • bar
  • baz
+ + + HTML; + + $document = new HtmlDocument($html); + + expect($document->querySelectorAll('ul li')->count())->toBe(3); +}); + +it('can be iterated and the elements are instances of Crwlr\Crawler\Steps\Dom\Node', function () { + $html = << + + Foo + + +
  • foo
  • bar
  • baz
+ + + HTML; + + $document = new HtmlDocument($html); + + $iteratesAnyNodes = false; + + foreach ($document->querySelectorAll('ul li') as $node) { + expect($node)->toBeInstanceOf(Node::class); + + $iteratesAnyNodes = true; + } + + expect($iteratesAnyNodes)->toBeTrue(); +}); + +it( + 'can be iterated with the each() method and return values are returned as an array from the each() call', + function () { + $html = << + + +
+
foo
+
bar
+
baz
+
quz
+
+ + + HTML; + + $document = new HtmlDocument($html); + + $result = $document->querySelectorAll('.list .element')->each(function ($node) { + return $node->text() . ' check'; + }); + + expect($result)->toBe([ + 'foo check', + 'bar check', + 'baz check', + 'quz check', + ]); + }, +); + +test('an empty NodeList can be iterated', function () { + $html = << + + Foo + + +
  • foo
  • bar
  • baz
+ + + HTML; + + $document = new HtmlDocument($html); + + $iteratesAnyNodes = false; + + foreach ($document->querySelectorAll('ul lulu') as $node) { + $iteratesAnyNodes = true; + } + + expect($iteratesAnyNodes)->toBeFalse(); +}); + +it('returns the first, last and nth element of the NodeList', function () { + $html = << + + +
+
foo
+
bar
+
baz
+
quz
+
+ + + HTML; + + $document = new HtmlDocument($html); + + $list = $document->querySelectorAll('.list .element'); + + expect($list->first())->toBeInstanceOf(HtmlElement::class) + ->and($list->first()?->text())->toBe('foo') + ->and($list->nth(2))->toBeInstanceOf(HtmlElement::class) + ->and($list->nth(2)?->text())->toBe('bar') + ->and($list->nth(3))->toBeInstanceOf(HtmlElement::class) + ->and($list->nth(3)?->text())->toBe('baz') + ->and($list->last())->toBeInstanceOf(HtmlElement::class) + ->and($list->last()?->text())->toBe('quz'); +}); diff --git a/tests/Steps/Dom/NodeTest.php b/tests/Steps/Dom/NodeTest.php new file mode 100644 index 0000000..8f0a8ab --- /dev/null +++ b/tests/Steps/Dom/NodeTest.php @@ -0,0 +1,615 @@ +filter($selectNode)->first(); +} + +/** + * @throws Exception + */ +function helper_getLegacyDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): DOMNode +{ + $node = (new Crawler($source))->filter($selectNode)->first()->getNode(0); + + if (!$node) { + throw new Exception('Can\'t get legacy node'); + } + + return $node; +} + +function helper_getPhp84HtmlDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): \Dom\Node +{ + return HTMLDocument::createFromString($source, LIBXML_NOERROR)->querySelector($selectNode); +} + +function helper_getPhp84XmlDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): \Dom\Node +{ + return XMLDocument::createFromString($source, LIBXML_NOERROR)->querySelector($selectNode); +} + +/** + * @param \Dom\Node|DOMNode|Crawler $originalNode + */ +function helper_getAbstractNodeInstance(object $originalNode, bool $html = true): HtmlNodeStub|XmlNodeStub +{ + if ($html) { + return new HtmlNodeStub($originalNode); + } + + return new XmlNodeStub($originalNode); +} + +it('can be created from a \DOM\Node instance', function () { + $xml = << + + 1Foo + + XML; + + $domNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'); + + 
expect($domNode)->toBeInstanceOf(\Dom\Node::class); + + $node = new class ($domNode) extends Node { + protected function makeChildNodeInstance(object $node): Node + { + return new XmlElement($node); + } + }; + + expect($node)->toBeInstanceOf(Node::class) + ->and($node->text())->toBe('1Foo'); +})->group('php84'); + +it('can be instantiated from a symfony Crawler instance', function () { + $xml = << + + 1Foo + + XML; + + $crawler = helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item'); + + expect($crawler)->toBeInstanceOf(Crawler::class); + + $node = new class ($crawler) extends Node { + protected function makeChildNodeInstance(object $node): Node + { + return new XmlElement($node); + } + }; + + expect($node)->toBeInstanceOf(Node::class) + ->and($node->text())->toBe('1Foo'); +}); + +it('can be instantiated from a DOMNode instance', function () { + $xml = << + + 1Foo + + XML; + + $domNode = helper_getLegacyDomNodeInstanceFromSource($xml, 'items item'); + + expect($domNode)->toBeInstanceOf(DOMNode::class); + + $node = new class ($domNode) extends Node { + protected function makeChildNodeInstance(object $node): Node + { + return new XmlElement($node); + } + }; + + expect($node)->toBeInstanceOf(Node::class) + ->and($node->text())->toBe('1Foo'); +}); + +$html = << + + Foo + + +
+

Title

+
+ + + HTML; + +it('selects an element within a node via querySelector()', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + $selectedNode = $node->querySelector('.foo h1'); + + expect($selectedNode)->toBeInstanceOf(Node::class) + ->and($selectedNode?->text())->toBe('Title'); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html)], + [helper_getLegacyDomNodeInstanceFromSource($html)], +]); + +it('selects an element within a node via querySelector() in PHP >= 8.4', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); + + $node = helper_getAbstractNodeInstance($originalNode); + + $selectedNode = $node->querySelector('.foo h1'); + + expect($selectedNode)->toBeInstanceOf(Node::class) + ->and($selectedNode?->text())->toBe('Title'); +})->group('php84'); + +$html = << + Bar + +
+

Foo

+
+
+

Bar

+
+ + + HTML; + +test( + 'querySelector() selects the first element within a node, when multiple nodes match a selector', + function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + $selectedNode = $node->querySelector('.foo h2'); + + expect($selectedNode)->toBeInstanceOf(Node::class) + ->and($selectedNode?->text())->toBe('Foo'); + }, +)->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html)], + [helper_getLegacyDomNodeInstanceFromSource($html)], +]); + +it( + 'selects the first element within a node using querySelector(), when multiple nodes match a selector in PHP >= 8.4', + function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); + + $node = helper_getAbstractNodeInstance($originalNode); + + $selectedNode = $node->querySelector('.foo h2'); + + expect($selectedNode)->toBeInstanceOf(Node::class) + ->and($selectedNode?->text())->toBe('Foo'); + }, +)->group('php84'); + +$html = << + Foo + + yo + + + HTML; + +it('returns null when the selector passed to querySelector() matches nothing', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + $selectedNode = $node->querySelector('.foo h2'); + + expect($selectedNode)->toBeNull(); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html)], + [helper_getLegacyDomNodeInstanceFromSource($html)], +]); + +it('returns null when the selector passed to querySelector() matches nothing in PHP >= 8.4', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); + + $node = helper_getAbstractNodeInstance($originalNode); + + $selectedNode = $node->querySelector('.foo h2'); + + expect($selectedNode)->toBeNull(); +})->group('php84'); + +$xml = << + + + 1Foo + 2Bar + 3Baz + + + XML; + +it('selects all elements within a node, matching a selector using querySelectorAll()', function (object 
$originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + $selected = $node->querySelectorAll('items item title'); + + expect($selected)->toBeInstanceOf(NodeList::class) + ->and($selected->count())->toBe(3) + ->and($selected->first()?->text())->toBe('Foo') + ->and($selected->nth(2)?->text())->toBe('Bar') + ->and($selected->last()?->text())->toBe('Baz'); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')], + [helper_getLegacyDomNodeInstanceFromSource($xml, 'feed')], +]); + +it( + 'selects all elements within a node, matching a selector using querySelectorAll() in PHP >= 8.4', + function () use ($xml) { + $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'); + + $node = helper_getAbstractNodeInstance($originalNode); + + $selected = $node->querySelectorAll('items item title'); + + expect($selected)->toBeInstanceOf(NodeList::class) + ->and($selected->count())->toBe(3) + ->and($selected->first()?->text())->toBe('Foo') + ->and($selected->nth(2)?->text())->toBe('Bar') + ->and($selected->last()?->text())->toBe('Baz'); + }, +)->group('php84'); + +$xml = << + + 123 + + XML; + +it( + 'gets an empty NodeList when nothing matches the selector passed to querySelectorAll()', + function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + $selected = $node->querySelectorAll('items item author'); + + expect($selected)->toBeInstanceOf(NodeList::class) + ->and($selected->count())->toBe(0); + }, +)->with([ + [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')], + [helper_getLegacyDomNodeInstanceFromSource($xml, 'feed')], +]); + +it( + 'gets an empty NodeList when nothing matches the selector passed to querySelectorAll() in PHP >= 8.4', + function () use ($xml) { + $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'); + + $node = helper_getAbstractNodeInstance($originalNode); + + 
$selected = $node->querySelectorAll('items item author'); + + expect($selected)->toBeInstanceOf(NodeList::class) + ->and($selected->count())->toBe(0); + }, +)->group('php84'); + +$html = << + Lorem Ipsum + +
  • hip
  • hop
  • hooray
+ + + HTML; + +it( + 'selects all elements within a node, matching an XPath query using queryXPath()', + function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + $selected = $node->queryXPath('//ul/li'); + + expect($selected)->toBeInstanceOf(NodeList::class) + ->and($selected->count())->toBe(3) + ->and($selected->first()?->text())->toBe('hip') + ->and($selected->nth(2)?->text())->toBe('hop') + ->and($selected->last()?->text())->toBe('hooray'); + }, +)->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html)], + [helper_getLegacyDomNodeInstanceFromSource($html)], +]); + +it( + 'selects all elements within a node, matching an XPath query using queryXPath() in PHP >= 8.4', + function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); + + $node = helper_getAbstractNodeInstance($originalNode); + + $selected = $node->queryXPath('//ul/li'); + + expect($selected)->toBeInstanceOf(NodeList::class) + ->and($selected->count())->toBe(3) + ->and($selected->first()?->text())->toBe('hip') + ->and($selected->nth(2)?->text())->toBe('hop') + ->and($selected->last()?->text())->toBe('hooray'); + }, +)->group('php84'); + +it('gets an empty NodeList when nothing matches the selector passed to queryXPath()', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + $selected = $node->queryXPath('//ul/li/strong'); + + expect($selected)->toBeInstanceOf(NodeList::class) + ->and($selected->count())->toBe(0); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html)], + [helper_getLegacyDomNodeInstanceFromSource($html)], +]); + +it( + 'gets an empty NodeList when nothing matches the selector passed to queryXPath() in PHP => 8.4', + function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); + + $node = helper_getAbstractNodeInstance($originalNode); + 
+ $selected = $node->queryXPath('//ul/li/strong'); + + expect($selected)->toBeInstanceOf(NodeList::class) + ->and($selected->count())->toBe(0); + }, +)->group('php84'); + +$html = << + Foo + +
+ + + HTML; + +it('gets the value of an attribute', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->getAttribute('data-test'))->toBe('hi'); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')], + [helper_getLegacyDomNodeInstanceFromSource($html, '.element')], +]); + +it('gets the value of an attribute in PHP >= 8.4', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'); + + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->getAttribute('data-test'))->toBe('hi'); +})->group('php84'); + +$html = << + Foo +
+ + HTML; + +it('returns null when an attribute does not exist', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->getAttribute('data-test'))->toBeNull(); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')], + [helper_getLegacyDomNodeInstanceFromSource($html, '.element')], +]); + +it('returns null when an attribute does not exist in PHP >= 8.4', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'); + + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->getAttribute('data-test'))->toBeNull(); +})->group('php84'); + +it('gets the name of a node', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->nodeName())->toBe('div'); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')], + [helper_getLegacyDomNodeInstanceFromSource($html, '.element')], +]); + +it('gets the name of a node in PHP >= 8.4', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'); + + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->nodeName())->toBe('div'); +})->group('php84'); + +$html = << + Bar + +

Title

Lorem ipsum.

+ + + HTML; + +it('gets the text content of an HTML node', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->text())->toBe('Title Lorem ipsum.'); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')], + [helper_getLegacyDomNodeInstanceFromSource($html, 'article')], +]); + +it('gets the text content of an HTML node in PHP >= 8.4', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'); + + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->text())->toBe('Title Lorem ipsum.'); +})->group('php84'); + +it('gets the inner source of an HTML node', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->inner())->toBe('

Title

Lorem ipsum.

'); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')], + [helper_getLegacyDomNodeInstanceFromSource($html, 'article')], +]); + +it('gets the inner source of an HTML node in PHP >= 8.4', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'); + + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->inner())->toBe('

Title

Lorem ipsum.

'); +})->group('php84'); + +it('gets the outer source of an HTML node', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->outer())->toBe('

Title

Lorem ipsum.

'); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')], + [helper_getLegacyDomNodeInstanceFromSource($html, 'article')], +]); + +it('gets the outer source of an HTML node in PHP >= 8.4', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'); + + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->outer())->toBe('

Title

Lorem ipsum.

'); +})->group('php84'); + +$xml = << + 1 Lorem Ipsum + XML; + +it('gets the text content of an XML node', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->text())->toBe('1 Lorem Ipsum'); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')], + [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')], +]); + +it('gets the text content of an XML node in PHP >= 8.4', function () use ($xml) { + $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'); + + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->text())->toBe('1 Lorem Ipsum'); +})->group('php84'); + +it('gets the inner source of an XML node', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->inner())->toBe(' 1 Lorem Ipsum '); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')], + [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')], +]); + +it('gets the inner source of an XML node in PHP >= 8.4', function () use ($xml) { + $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'); + + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->inner())->toBe(' 1 Lorem Ipsum '); +})->group('php84'); + +it('gets the outer source of an XML node', function (object $originalNode) { + /** @var Crawler|DOMNode $originalNode */ + $node = helper_getAbstractNodeInstance($originalNode); + + expect($node->outer())->toBe(' 1 Lorem Ipsum '); +})->with([ + [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')], + [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')], +]); + +it('gets the outer source of an XML node in PHP >= 8.4', function () use ($xml) { + $originalNode = helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'); + + $node = 
helper_getAbstractNodeInstance($originalNode); + + expect($node->outer())->toBe(' 1 Lorem Ipsum '); +})->group('php84'); + +$html = << + Bar + +
  • one
+ +
  • foo
+ + + HTML; + +it('selects elements using a CSS selector containing the :has() pseudo class', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); + + $node = helper_getAbstractNodeInstance($originalNode); + + $selected = $node->querySelector('ul:has(.foo)'); + + expect($selected)->toBeInstanceOf(HtmlElement::class) + ->and($selected?->text())->toBe('one'); +})->group('php84'); + +it('selects elements using a CSS selector containing the :not() pseudo class', function () use ($html) { + $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html); + + $node = helper_getAbstractNodeInstance($originalNode); + + $selected = $node->querySelector('ul:not(:has(.foo))'); + + expect($selected)->toBeInstanceOf(HtmlElement::class) + ->and($selected?->text())->toBe('foo'); +})->group('php84'); diff --git a/tests/Steps/Dom/XmlDocumentTest.php b/tests/Steps/Dom/XmlDocumentTest.php new file mode 100644 index 0000000..490331b --- /dev/null +++ b/tests/Steps/Dom/XmlDocumentTest.php @@ -0,0 +1,70 @@ + + + 1 + + XML; + + $document = new XmlDocument($xml); + + expect($document->querySelector('feed items item'))->toBeInstanceOf(XmlElement::class); +}); + +test('the querySelectorAll() method returns a NodeList of XmlElement objects', function () { + $xml = << + + 123 + + XML; + + $document = new XmlDocument($xml); + + $nodeList = $document->querySelectorAll('feed items item'); + + expect($nodeList)->toBeInstanceOf(NodeList::class); + + $anyNodesChecked = false; + + foreach ($nodeList as $node) { + expect($node)->toBeInstanceOf(XmlElement::class); + + $anyNodesChecked = true; + } + + expect($anyNodesChecked)->toBeTrue(); +}); + +test('the queryXPath() method returns a NodeList of XmlElement objects', function () { + $xml = << + + 123 + + XML; + + $document = new XmlDocument($xml); + + $nodeList = $document->queryXPath('//feed/items/item'); + + expect($nodeList)->toBeInstanceOf(NodeList::class); + + $anyNodesChecked = false; + + foreach 
($nodeList as $node) { + expect($node)->toBeInstanceOf(XmlElement::class); + + $anyNodesChecked = true; + } + + expect($anyNodesChecked)->toBeTrue(); +}); diff --git a/tests/Steps/Dom/XmlElementTest.php b/tests/Steps/Dom/XmlElementTest.php new file mode 100644 index 0000000..d1c5eac --- /dev/null +++ b/tests/Steps/Dom/XmlElementTest.php @@ -0,0 +1,96 @@ + + + foo + foo + + + abc-123 + 2024-11-07T11:00:31Z + Foo bar baz! + https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xml + test + + + abc-124 + 2024-12-04T22:43:14Z + Lorem Ipsum! + https://www.example.com/item-2?utm_source=foo&utm_medium=feed-xml + heyho + + + + XML; + +test('child nodes selected via querySelector() are HtmlElement instances', function () use ($xml) { + $document = new XmlDocument($xml); + + $wrapperElement = $document->querySelector('feed'); + + expect($wrapperElement)->toBeInstanceOf(XmlElement::class) + ->and($wrapperElement?->querySelector('items item'))->toBeInstanceOf(XmlElement::class); +}); + +test('child nodes selected via querySelectorAll() are HtmlElement instances', function () use ($xml) { + $document = new XmlDocument($xml); + + $wrapperElement = $document->querySelector('feed'); + + expect($wrapperElement)->toBeInstanceOf(XmlElement::class); + + $childNodeList = $wrapperElement?->querySelectorAll('items item'); + + expect($childNodeList)->toBeInstanceOf(NodeList::class) + ->and($childNodeList?->count())->toBe(2) + ->and($childNodeList?->first())->toBeInstanceOf(XmlElement::class) + ->and($childNodeList?->last())->toBeInstanceOf(XmlElement::class); +}); + +it('gets the node name', function () use ($xml) { + $document = new XmlDocument($xml); + + $node = $document->querySelector('feed'); + + expect($node?->nodeName())->toBe('feed') + ->and($node?->querySelector('items item')?->nodeName())->toBe('item'); +}); + +it('gets the text of a node', function () use ($xml) { + $document = new XmlDocument($xml); + + $node = $document->querySelector('feed items item:nth-child(2) 
foo'); + + expect($node?->text())->toBe('heyho'); +}); + +it('gets the outer XML of a node', function () use ($xml) { + $document = new XmlDocument($xml); + + $node = $document->querySelector('feed items item foo baRbaz'); + + expect($node?->outerXml())->toBe('test'); +}); + +it('gets the inner XML of a node', function () use ($xml) { + $document = new XmlDocument($xml); + + $node = $document->querySelector('feed items item foo'); + + expect($node?->innerXml())->toBe(' test '); +}); + +it('gets an attribute from a node', function () use ($xml) { + $document = new XmlDocument($xml); + + $node = $document->querySelector('feed items item:first-child title'); + + expect($node?->getAttribute('lang'))->toBe('en'); +}); diff --git a/tests/Steps/Dom/_Stubs/HtmlNodeStub.php b/tests/Steps/Dom/_Stubs/HtmlNodeStub.php new file mode 100644 index 0000000..3967379 --- /dev/null +++ b/tests/Steps/Dom/_Stubs/HtmlNodeStub.php @@ -0,0 +1,24 @@ +innerSource(); + } + + public function outer(): string + { + return $this->outerSource(); + } + + protected function makeChildNodeInstance(object $node): Node + { + return new HtmlElement($node); + } +} diff --git a/tests/Steps/Dom/_Stubs/XmlNodeStub.php b/tests/Steps/Dom/_Stubs/XmlNodeStub.php new file mode 100644 index 0000000..06e55d0 --- /dev/null +++ b/tests/Steps/Dom/_Stubs/XmlNodeStub.php @@ -0,0 +1,24 @@ +innerSource(); + } + + public function outer(): string + { + return $this->outerSource(); + } + + protected function makeChildNodeInstance(object $node): Node + { + return new XmlElement($node); + } +} diff --git a/tests/Steps/DomTest.php b/tests/Steps/DomTest.php index 6d6c7ff..e72ec72 100644 --- a/tests/Steps/DomTest.php +++ b/tests/Steps/DomTest.php @@ -6,13 +6,12 @@ use Crwlr\Crawler\Input; use Crwlr\Crawler\Steps\Dom; use Crwlr\Crawler\Steps\Html\CssSelector; -use Crwlr\Crawler\Steps\Html\DomQueryInterface; +use Crwlr\Crawler\Steps\Html\DomQuery; use Crwlr\Crawler\Steps\Html\XPathQuery; use GuzzleHttp\Psr7\Request; use 
GuzzleHttp\Psr7\Response; use InvalidArgumentException; use stdClass; -use Symfony\Component\DomCrawler\Crawler; use function tests\helper_getStepFilesContent; use function tests\helper_invokeStepWithInput; @@ -24,7 +23,7 @@ function helper_getDomStepInstance(array $mapping = []): Dom { return new class ($mapping) extends Dom { - protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface + protected function makeDefaultDomQueryInstance(string $query): DomQuery { return new CssSelector($query); } @@ -42,7 +41,7 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface test('ResponseInterface is a valid input', function () { $output = helper_invokeStepWithInput(helper_getDomStepInstance()::root(), new Response()); - expect($output)->toHaveCount(0); + expect($output[0]->get())->toBe([]); }); test('RespondedRequest is a valid input', function () { @@ -51,7 +50,7 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface new RespondedRequest(new Request('GET', '/'), new Response()), ); - expect($output)->toHaveCount(0); + expect($output[0]->get())->toBe([]); }); test('For other inputs an InvalidArgumentException is thrown', function (mixed $input) { @@ -68,9 +67,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($outputs)->toHaveCount(1); - - expect($outputs[0]->get())->toBe('match 2'); + expect($outputs)->toHaveCount(1) + ->and($outputs[0]->get())->toBe('match 2'); }); it('outputs multiple strings when argument for extract is a selector string matching multiple elements', function () { @@ -79,11 +77,9 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($outputs)->toHaveCount(3); - - expect($outputs[0]->get())->toBe('match 1'); - - expect($outputs[2]->get())->toBe('match 3'); + expect($outputs)->toHaveCount(3) + 
->and($outputs[0]->get())->toBe('match 1') + ->and($outputs[2]->get())->toBe('match 3'); }); it('also takes a DomQuery instance as argument for extract', function () { @@ -92,9 +88,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($outputs)->toHaveCount(1); - - expect($outputs[0]->get())->toBe('match 2'); + expect($outputs)->toHaveCount(1) + ->and($outputs[0]->get())->toBe('match 2'); }); test('Extracting with single selector also works with each', function () { @@ -103,11 +98,9 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($outputs)->toHaveCount(2); - - expect($outputs[0]->get())->toBe('match 2'); - - expect($outputs[1]->get())->toBe('match 3'); + expect($outputs)->toHaveCount(2) + ->and($outputs[0]->get())->toBe('match 2') + ->and($outputs[1]->get())->toBe('match 3'); }); test('Extracting with single selector also works with first', function () { @@ -116,9 +109,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($outputs)->toHaveCount(1); - - expect($outputs[0]->get())->toBe('match 2'); + expect($outputs)->toHaveCount(1) + ->and($outputs[0]->get())->toBe('match 2'); }); test('Extracting with single selector also works with last', function () { @@ -127,9 +119,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($outputs)->toHaveCount(1); - - expect($outputs[0]->get())->toBe('match 3'); + expect($outputs)->toHaveCount(1) + ->and($outputs[0]->get())->toBe('match 3'); }); test('Extracting with single selector that doesn\'t match anything doesn\'t yield any output', function () { @@ -147,9 +138,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface 
helper_getStepFilesContent('Html/basic.html'), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe(['matches' => ['match 1', 'match 2', 'match 3']]); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe(['matches' => ['match 1', 'match 2', 'match 3']]); }); it('extracts each matching result when the each method is used', function () { @@ -158,11 +148,9 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($output)->toHaveCount(2); - - expect($output[0]->get())->toBe(['match' => 'match 2']); - - expect($output[1]->get())->toBe(['match' => 'match 3']); + expect($output)->toHaveCount(2) + ->and($output[0]->get())->toBe(['match' => 'match 2']) + ->and($output[1]->get())->toBe(['match' => 'match 3']); }); it('extracts the first matching result when the first method is used', function () { @@ -171,9 +159,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe(['match' => 'match 2']); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe(['match' => 'match 2']); }); it('extracts the last matching result when the last method is used', function () { @@ -182,9 +169,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe(['match' => 'match 3']); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe(['match' => 'match 3']); }); it('doesn\'t yield any output when the each selector doesn\'t match anything', function () { @@ -220,9 +206,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface helper_getStepFilesContent('Html/basic.html'), ); - expect($output)->toHaveCount(1); - - 
expect($output[0]->get())->toBe(['match' => 'match 3', 'noMatch' => null]); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe(['match' => 'match 3', 'noMatch' => null]); }); test('The static cssSelector method returns an instance of CssSelector using the provided selector', function () { @@ -230,7 +215,7 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface expect($cssSelector)->toBeInstanceOf(CssSelector::class); - $itemContent = $cssSelector->apply(new Crawler('yes')); + $itemContent = $cssSelector->apply(new Dom\HtmlDocument('yes')); expect($itemContent)->toBe('yes'); }); @@ -240,7 +225,7 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface expect($xPathQuery)->toBeInstanceOf(XPathQuery::class); - $itemContent = $xPathQuery->apply(new Crawler('yes')); + $itemContent = $xPathQuery->apply(new Dom\XmlDocument('yes')); expect($itemContent)->toBe('yes'); }); @@ -251,9 +236,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface '

foo content

bar content

baz content

', ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe(['foo' => 'foo content', 'notBar' => 'bar content', 0 => 'baz content']); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe(['foo' => 'foo content', 'notBar' => 'bar content', 0 => 'baz content']); }); it('trims the extracted data', function () { @@ -262,9 +246,8 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface "

\n foo content \n \n

", ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe(['foo' => 'foo content']); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe(['foo' => 'foo content']); }); it('automatically passes on the base url to dom query instances when the input is a RespondedRequest', function () { @@ -279,12 +262,11 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface ), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe([ - 'one' => 'https://www.example.com/foo/bar', - 'two' => 'https://www.example.com/yo/lo', - ]); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe([ + 'one' => 'https://www.example.com/foo/bar', + 'two' => 'https://www.example.com/yo/lo', + ]); }); it('removes the fragment part from URLs when the withoutFragment method is called on a DomQuery instance', function () { @@ -310,12 +292,11 @@ protected function makeDefaultDomQueryInstance(string $query): DomQueryInterface ), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe([ - 'one' => 'https://www.example.com/foo#foo', - 'two' => 'https://www.example.com/bar#bar', - 'three' => 'https://www.example.com/baz', - 'four' => 'https://www.example.com/quz', - ]); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe([ + 'one' => 'https://www.example.com/foo#foo', + 'two' => 'https://www.example.com/bar#bar', + 'three' => 'https://www.example.com/baz', + 'four' => 'https://www.example.com/quz', + ]); }); diff --git a/tests/Steps/Filters/ArrayFilterTest.php b/tests/Steps/Filters/ArrayFilterTest.php new file mode 100644 index 0000000..3130dfe --- /dev/null +++ b/tests/Steps/Filters/ArrayFilterTest.php @@ -0,0 +1,109 @@ +where(Filter::equal('foo')); + + expect($filter->evaluate($values))->toBe($evaluationResult); +})->with([ + [['foo', 'bar', 'baz'], true], + [['bar', 'baz', 'quz'], false], +]); + +it('filters a multi-level array by a key of the array elements (which are also arrays)', 
function () { + $values = [ + ['foo' => 'one', 'bar' => 'two'], + ['foo' => 'two', 'bar' => 'three'], + ['foo' => 'three', 'bar' => 'four'], + ]; + + $filter = Filter::arrayHasElement()->where('foo', Filter::equal('four')); + + expect($filter->evaluate($values))->toBeFalse(); + + $filter = Filter::arrayHasElement()->where('foo', Filter::equal('two')); + + expect($filter->evaluate($values))->toBeTrue(); +}); + +it('applies multiple complex filters on a multi-level array', function () { + $values = [ + [ + 'id' => '123', + 'name' => 'abc', + 'tags' => [ + ['type' => 'companyId', 'value' => '123'], + ['type' => 'type', 'value' => 'job-ad'], + ['type' => 'companyId', 'value' => '125'], + ], + ], + [ + 'id' => '124', + 'name' => 'abd', + 'tags' => [ + ['type' => 'companyId', 'value' => '123'], + ['type' => 'type', 'value' => 'blog-post'], + ['type' => 'author', 'value' => 'John Doe'], + ], + ], + [ + 'id' => '125', + 'name' => 'abf', + 'tags' => [ + ['type' => 'companyId', 'value' => '123'], + ['type' => 'companyId', 'value' => '124'], + ['type' => 'type', 'value' => 'job-ad'], + ['type' => 'companyId', 'value' => '125'], + ], + ], + ]; + + $filter = Filter::arrayHasElement() + ->where( + 'tags', + Filter::arrayHasElement() + ->where('type', Filter::equal('companyId')) + ->where('value', Filter::equal('123')), + ) + ->where( + 'tags', + Filter::arrayHasElement() + ->where('type', Filter::equal('companyId')) + ->where('value', Filter::equal('124')) + ->negate(), + ) + ->where( + 'tags', + Filter::arrayHasElement() + ->where('type', Filter::equal('type')) + ->where('value', Filter::equal('job-ad')), + ); + + expect($filter->evaluate($values))->toBeTrue(); + + $filter = Filter::arrayHasElement() + ->where( + 'tags', + Filter::arrayHasElement() + ->where('type', Filter::equal('companyId')) + ->where('value', Filter::equal('123')), + ) + ->where( + 'tags', + Filter::arrayHasElement() + ->where('type', Filter::equal('companyId')) + ->where('value', Filter::equal('125')) + 
->negate(), + ) + ->where( + 'tags', + Filter::arrayHasElement() + ->where('type', Filter::equal('type')) + ->where('value', Filter::equal('job-ad')), + ); + + expect($filter->evaluate($values))->toBeFalse(); +}); diff --git a/tests/Steps/Filters/FilterTest.php b/tests/Steps/Filters/FilterTest.php index df5b172..b1d9811 100644 --- a/tests/Steps/Filters/FilterTest.php +++ b/tests/Steps/Filters/FilterTest.php @@ -2,13 +2,13 @@ namespace tests\Steps\Filters; -use Crwlr\Crawler\Steps\Filters\Filter; +use Crwlr\Crawler\Steps\Filters\AbstractFilter; use Exception; use InvalidArgumentException; use function tests\helper_getStdClassWithData; -class TestFilter extends Filter +class TestFilter extends AbstractFilter { public string $value = ''; diff --git a/tests/Steps/Html/CssSelectorTest.php b/tests/Steps/Html/CssSelectorTest.php index 80b2dae..be36042 100644 --- a/tests/Steps/Html/CssSelectorTest.php +++ b/tests/Steps/Html/CssSelectorTest.php @@ -2,10 +2,10 @@ namespace tests\Steps\Html; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; use Crwlr\Crawler\Steps\Html\CssSelector; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use Crwlr\Html2Text\Html2Text; -use Symfony\Component\DomCrawler\Crawler; use function tests\helper_getSimpleListHtml; @@ -16,25 +16,19 @@ test('The apply method returns a string for a single match', function () { $html = '
test
'; - $domCrawler = new Crawler($html); - - expect((new CssSelector('.item'))->apply($domCrawler))->toBe('test'); + expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe('test'); }); test('The apply method returns an array of strings for multiple matches', function () { $html = '
test
test 2 sub
test 3
'; - $domCrawler = new Crawler($html); - - expect((new CssSelector('.item'))->apply($domCrawler))->toBe(['test', 'test 2 sub', 'test 3']); + expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe(['test', 'test 2 sub', 'test 3']); }); test('The apply method returns null if nothing matches', function () { $html = '
test
'; - $domCrawler = new Crawler($html); - - expect((new CssSelector('.aitem'))->apply($domCrawler))->toBeNull(); + expect((new CssSelector('.aitem'))->apply(new HtmlDocument($html)))->toBeNull(); }); it('trims whitespace', function () { @@ -44,64 +38,26 @@ HTML; - $domCrawler = new Crawler($html); - - expect((new CssSelector('.item'))->apply($domCrawler))->toBe('test'); -}); - -test('The filter method returns the filtered Symfony DOM Crawler instance', function () { - $html = << -
one
-
two
-
three
- - HTML; - - $domCrawler = new Crawler($html); - - $filtered = (new CssSelector('#items .item[data-match=1]'))->filter($domCrawler); - - expect($filtered)->toBeInstanceOf(Crawler::class); - - expect($filtered->count())->toBe(2); - - expect($filtered->first()->outerHtml())->toBe('
one
'); - - expect($filtered->last()->outerHtml())->toBe('
two
'); + expect((new CssSelector('.item'))->apply(new HtmlDocument($html)))->toBe('test'); }); it('contains inner tags when the html method is called', function () { $html = '
test sub
'; - $domCrawler = new Crawler($html); - - expect((new CssSelector('.item'))->html()->apply($domCrawler))->toBe('test sub'); + expect((new CssSelector('.item'))->html()->apply(new HtmlDocument($html)))->toBe('test sub'); }); it('contains also the outer tag when the outerHtml method is called', function () { $html = '
test sub
'; - $domCrawler = new Crawler($html); - - expect((new CssSelector('.item'))->outerHtml()->apply($domCrawler)) + expect((new CssSelector('.item'))->outerHtml()->apply(new HtmlDocument($html))) ->toBe('
test sub
'); }); -it('does not contain text of children when innerText is called', function () { - $html = '
test sub
'; - - $domCrawler = new Crawler($html); - - expect((new CssSelector('.item'))->innerText()->apply($domCrawler))->toBe('test'); -}); - it('returns formatted text when formattedText() is called', function () { $html = '

headline

paragraph

  • item 1
  • item 2
'; - $domCrawler = new Crawler($html); - - expect((new CssSelector('#a'))->formattedText()->apply($domCrawler)) + expect((new CssSelector('#a'))->formattedText()->apply(new HtmlDocument($html))) ->toBe(<<removeConverter('ul'); - expect((new CssSelector('#a'))->formattedText($converter)->apply($domCrawler)) + expect((new CssSelector('#a'))->formattedText($converter)->apply(new HtmlDocument($html))) ->toBe(<<attribute('data-attr')->apply($domCrawler))->toBe('content'); + expect((new CssSelector('.item'))->attribute('data-attr')->apply(new HtmlDocument($html)))->toBe('content'); }); it('turns the value into an absolute url when toAbsoluteUrl() is called', function () { $html = 'getting started'; - $domCrawler = new Crawler($html); + $document = new HtmlDocument($html); $selector = new CssSelector('a'); $selector->setBaseUrl('https://www.crwlr.software/') ->attribute('href'); - expect($selector->apply($domCrawler))->toBe('/packages/crawler/v0.4/getting-started'); + expect($selector->apply($document))->toBe('/packages/crawler/v0.4/getting-started'); $selector->toAbsoluteUrl(); - expect($selector->apply($domCrawler))->toBe('https://www.crwlr.software/packages/crawler/v0.4/getting-started'); + expect($selector->apply($document))->toBe('https://www.crwlr.software/packages/crawler/v0.4/getting-started'); }); it( @@ -170,81 +122,69 @@ function () { HTML; - $domCrawler = new Crawler($html); + $document = new HtmlDocument($html); $selector = new CssSelector('a'); $selector->setBaseUrl('https://www.example.com/a/b') ->attribute('href'); - expect($selector->apply($domCrawler))->toBe('e'); + expect($selector->apply($document))->toBe('e'); $selector->toAbsoluteUrl(); - expect($selector->apply($domCrawler))->toBe('https://www.example.com/c/e'); + expect($selector->apply($document))->toBe('https://www.example.com/c/e'); }, ); it('gets an absolute link from the href attribute of a link element, when the link() method is called', function () { $html = ''; - $domCrawler = new 
Crawler($html); + $document = new HtmlDocument($html); $selector = new CssSelector('#foo .bar'); $selector->setBaseUrl('https://www.example.com/'); - expect($selector->apply($domCrawler))->toBe('Foo'); + expect($selector->apply($document))->toBe('Foo'); $selector->link(); - expect($selector->apply($domCrawler))->toBe('https://www.example.com/foo/bar'); + expect($selector->apply($document))->toBe('https://www.example.com/foo/bar'); }); it('gets only the first matching element when the first() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new CssSelector('#list .item'))->first(); - expect($selector->apply($domCrawler))->toBe('one'); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('one'); }); it('gets only the last matching element when the last() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new CssSelector('#list .item'))->last(); - expect($selector->apply($domCrawler))->toBe('four'); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('four'); }); it('gets only the nth matching element when the nth() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new CssSelector('#list .item'))->nth(3); - expect($selector->apply($domCrawler))->toBe('three'); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('three'); }); it('returns null when no nth matching element exists', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new CssSelector('#list .item'))->nth(5); - expect($selector->apply($domCrawler))->toBeNull(); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBeNull(); }); it('gets only even matching elements when the even() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new CssSelector('#list 
.item'))->even(); - expect($selector->apply($domCrawler))->toBe(['two', 'four']); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe(['two', 'four']); }); it('gets only odd matching elements when the odd() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new CssSelector('#list .item'))->odd(); - expect($selector->apply($domCrawler))->toBe(['one', 'three']); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe(['one', 'three']); }); diff --git a/tests/Steps/Html/XPathQueryTest.php b/tests/Steps/Html/XPathQueryTest.php index e74b66d..51c9a23 100644 --- a/tests/Steps/Html/XPathQueryTest.php +++ b/tests/Steps/Html/XPathQueryTest.php @@ -2,9 +2,10 @@ namespace tests\Steps\Html; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; +use Crwlr\Crawler\Steps\Dom\XmlDocument; use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException; use Crwlr\Crawler\Steps\Html\XPathQuery; -use Symfony\Component\DomCrawler\Crawler; use function tests\helper_getSimpleListHtml; @@ -15,25 +16,19 @@ test('The apply method returns a string for a single match', function () { $xml = 'test'; - $domCrawler = new Crawler($xml); - - expect((new XPathQuery('//item'))->apply($domCrawler))->toBe('test'); + expect((new XPathQuery('//item'))->apply(new XmlDocument($xml)))->toBe('test'); }); test('The apply method returns an array of strings for multiple matches', function () { - $xml = 'testtest 2 subtest 3'; - - $domCrawler = new Crawler($xml); + $html = 'testtest 2 subtest 3'; - expect((new XPathQuery('//item'))->apply($domCrawler))->toBe(['test', 'test 2 sub', 'test 3']); + expect((new XPathQuery('//item'))->apply(new HtmlDocument($html)))->toBe(['test', 'test 2 sub', 'test 3']); }); test('The apply method returns null if nothing matches', function () { $xml = 'test'; - $domCrawler = new Crawler($xml); - - expect((new XPathQuery('//aitem'))->apply($domCrawler))->toBeNull(); + expect((new 
XPathQuery('//aitem'))->apply(new XmlDocument($xml)))->toBeNull(); }); it('trims whitespace', function () { @@ -43,133 +38,89 @@ XML; - $domCrawler = new Crawler($xml); - - expect((new XPathQuery('//item'))->apply($domCrawler))->toBe('test'); -}); - -test('The filter method returns the filtered Symfony DOM Crawler instance', function () { - $xml = 'onetwothree'; - - $domCrawler = new Crawler($xml); - - $filtered = (new XPathQuery('//items/item[@match=\'1\']'))->filter($domCrawler); - - expect($filtered)->toBeInstanceOf(Crawler::class); - - expect($filtered->count())->toBe(2); - - expect($filtered->first()->outerHtml())->toBe('one'); - - expect($filtered->last()->outerHtml())->toBe('two'); + expect((new XPathQuery('//item'))->apply(new XmlDocument($xml)))->toBe('test'); }); it('contains inner tags when the html method is called', function () { $xml = 'test sub'; - $domCrawler = new Crawler($xml); - - expect((new XPathQuery('//item'))->html()->apply($domCrawler))->toBe('test sub'); + expect((new XPathQuery('//item'))->html()->apply(new XmlDocument($xml)))->toBe('test sub'); }); it('contains also the outer tag when the outerHtml method is called', function () { $xml = 'test sub'; - $domCrawler = new Crawler($xml); - - expect((new XPathQuery('//item'))->outerHtml()->apply($domCrawler))->toBe('test sub'); -}); - -it('does not contain text of children when innerText is called', function () { - $xml = 'test sub'; - - $domCrawler = new Crawler($xml); - - expect((new XPathQuery('//item'))->innerText()->apply($domCrawler))->toBe('test'); + expect((new XPathQuery('//item'))->outerHtml()->apply(new XmlDocument($xml)))->toBe('test sub'); }); it('gets the contents of an attribute using the attribute method', function () { $xml = 'test'; - $domCrawler = new Crawler($xml); - - expect((new XPathQuery('//item'))->attribute('attr')->apply($domCrawler))->toBe('content'); + expect((new XPathQuery('//item'))->attribute('attr')->apply(new XmlDocument($xml)))->toBe('content'); }); 
it('turns the value into an absolute url when toAbsoluteUrl() is called', function () { - $html = '/foo/bar'; + $xml = '/foo/bar'; - $domCrawler = new Crawler($html); + $document = new XmlDocument($xml); $query = (new XPathQuery('//item')) ->setBaseUrl('https://www.example.com'); - expect($query->apply($domCrawler))->toBe('/foo/bar'); + expect($query->apply($document))->toBe('/foo/bar'); $query->toAbsoluteUrl(); - expect($query->apply($domCrawler))->toBe('https://www.example.com/foo/bar'); + expect($query->apply($document))->toBe('https://www.example.com/foo/bar'); }); it('gets an absolute link from the href attribute of a link element, when the link() method is called', function () { $html = ''; - $domCrawler = new Crawler($html); + $document = new HtmlDocument($html); $selector = (new XPathQuery('//*[@id=\'foo\']/a[@class=\'bar\']')) ->setBaseUrl('https://www.example.com/'); - expect($selector->apply($domCrawler))->toBe('Foo'); + expect($selector->apply($document))->toBe('Foo'); $selector->link(); - expect($selector->apply($domCrawler))->toBe('https://www.example.com/foo/bar'); + expect($selector->apply($document))->toBe('https://www.example.com/foo/bar'); }); it('gets only the first matching element when the first() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->first(); - expect($selector->apply($domCrawler))->toBe('one'); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('one'); }); it('gets only the last matching element when the last() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->last(); - expect($selector->apply($domCrawler))->toBe('four'); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('four'); }); it('gets only the nth matching element when the nth() 
method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->nth(3); - expect($selector->apply($domCrawler))->toBe('three'); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe('three'); }); it('returns null when no nth matching element exists', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->nth(5); - expect($selector->apply($domCrawler))->toBeNull(); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBeNull(); }); it('gets only even matching elements when the even() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->even(); - expect($selector->apply($domCrawler))->toBe(['two', 'four']); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe(['two', 'four']); }); it('gets only odd matching elements when the odd() method is called', function () { - $domCrawler = new Crawler(helper_getSimpleListHtml()); - $selector = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->odd(); - expect($selector->apply($domCrawler))->toBe(['one', 'three']); + expect($selector->apply(new HtmlDocument(helper_getSimpleListHtml())))->toBe(['one', 'three']); }); diff --git a/tests/Steps/HtmlTest.php b/tests/Steps/HtmlTest.php index 7dd86b5..a847c35 100644 --- a/tests/Steps/HtmlTest.php +++ b/tests/Steps/HtmlTest.php @@ -29,11 +29,9 @@ function helper_getHtmlContent(string $fileName): string helper_getHtmlContent('bookstore.html'), ); - expect($output)->toHaveCount(4); - - expect($output[0]->get())->toBe('Everyday Italian'); - - expect($output[3]->get())->toBe('Learning XML'); + expect($output)->toHaveCount(4) + ->and($output[0]->get())->toBe('Everyday Italian') + 
->and($output[3]->get())->toBe('Learning XML'); }); it('extracts data from an HTML document with CSS selectors by default', function () { @@ -42,23 +40,19 @@ function helper_getHtmlContent(string $fileName): string helper_getHtmlContent('bookstore.html'), ); - expect($output)->toHaveCount(4); - - expect($output[0]->get())->toBe( - ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], - ); - - expect($output[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. Rowling', 'year' => '2005']); - - expect($output[2]->get())->toBe( - [ - 'title' => 'XQuery Kick Start', - 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], - 'year' => '2003', - ], - ); - - expect($output[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); + expect($output)->toHaveCount(4) + ->and($output[0]->get())->toBe( + ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], + ) + ->and($output[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. Rowling', 'year' => '2005']) + ->and($output[2]->get())->toBe( + [ + 'title' => 'XQuery Kick Start', + 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], + 'year' => '2003', + ], + ) + ->and($output[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. 
Ray', 'year' => '2003']); }); it('can also extract data using XPath queries', function () { @@ -71,15 +65,14 @@ function helper_getHtmlContent(string $fileName): string helper_getHtmlContent('bookstore.html'), ); - expect($output)->toHaveCount(4); - - expect($output[2]->get())->toBe( - [ - 'title' => 'XQuery Kick Start', - 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], - 'year' => '2003', - ], - ); + expect($output)->toHaveCount(4) + ->and($output[2]->get())->toBe( + [ + 'title' => 'XQuery Kick Start', + 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], + 'year' => '2003', + ], + ); }); it('returns only one (compound) output when the root method is used', function () { @@ -88,9 +81,8 @@ function helper_getHtmlContent(string $fileName): string helper_getHtmlContent('bookstore.html'), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get()['title'])->toBe(['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML']); + expect($output)->toHaveCount(1) + ->and($output[0]->get()['title'])->toBe(['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML']); }); it('extracts the data of the first matching element when the first method is used', function () { @@ -99,11 +91,10 @@ function helper_getHtmlContent(string $fileName): string helper_getHtmlContent('bookstore.html'), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe( - ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], - ); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe( + ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], + ); }); it('extracts the data of the last matching element when the last method is used', function () { @@ -112,9 +103,8 @@ function helper_getHtmlContent(string $fileName): string helper_getHtmlContent('bookstore.html'), ); - 
expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); }); test( @@ -140,30 +130,29 @@ function () { $response, ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe([ - 'title' => 'Some Meetup', - 'location' => 'Somewhere', - 'date' => '2023-01-14 21:00', - 'talks' => [ - [ - 'title' => 'Sophisticated talk title', - 'speaker' => 'Super Mario', - 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk1.pdf', - ], - [ - 'title' => 'Simple beginner talk', - 'speaker' => 'Luigi', - 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk2.pdf', + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe([ + 'title' => 'Some Meetup', + 'location' => 'Somewhere', + 'date' => '2023-01-14 21:00', + 'talks' => [ + [ + 'title' => 'Sophisticated talk title', + 'speaker' => 'Super Mario', + 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk1.pdf', + ], + [ + 'title' => 'Simple beginner talk', + 'speaker' => 'Luigi', + 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk2.pdf', + ], + [ + 'title' => 'Fun talk', + 'speaker' => 'Princess Peach', + 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk3.pdf', + ], ], - [ - 'title' => 'Fun talk', - 'speaker' => 'Princess Peach', - 'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk3.pdf', - ], - ], - ]); + ]); }, ); diff --git a/tests/Steps/Loading/Http/DocumentTest.php b/tests/Steps/Loading/Http/DocumentTest.php index 1221b51..8aeca5f 100644 --- a/tests/Steps/Loading/Http/DocumentTest.php +++ b/tests/Steps/Loading/Http/DocumentTest.php @@ -3,12 +3,12 @@ namespace tests\Steps\Loading\Http; use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; 
use Crwlr\Crawler\Steps\Loading\Http\Document; use GuzzleHttp\Psr7\Request; use GuzzleHttp\Psr7\Response; -use Symfony\Component\DomCrawler\Crawler; -it('creates a symfony DomCrawler instance from a RespondedRequest', function () { +it('creates a HtmlDocument instance from a RespondedRequest', function () { $body = 'foohello'; $respondedRequest = new RespondedRequest( @@ -18,9 +18,8 @@ $document = new Document($respondedRequest); - expect($document->dom())->toBeInstanceOf(Crawler::class); - - expect($document->dom()->outerHtml())->toBe('foohello'); + expect($document->dom())->toBeInstanceOf(HtmlDocument::class) + ->and($document->dom()->outerHtml())->toBe('foohello'); }); it('returns the effectiveUri as url()', function () { diff --git a/tests/Steps/Sitemap/GetUrlsFromSitemapTest.php b/tests/Steps/Sitemap/GetUrlsFromSitemapTest.php index 356252a..41fe451 100644 --- a/tests/Steps/Sitemap/GetUrlsFromSitemapTest.php +++ b/tests/Steps/Sitemap/GetUrlsFromSitemapTest.php @@ -24,11 +24,9 @@ $outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $xml); - expect($outputs)->toHaveCount(9); - - expect($outputs[0]->get())->toBe('https://www.crwlr.software/'); - - expect($outputs[8]->get())->toBe('https://www.crwlr.software/blog/prevent-homograph-attacks-in-user-input-urls'); + expect($outputs)->toHaveCount(9) + ->and($outputs[0]->get())->toBe('https://www.crwlr.software/') + ->and($outputs[8]->get())->toBe('https://www.crwlr.software/blog/prevent-homograph-attacks-in-user-input-urls'); }); it('gets all urls with additional data when the withData() method is used', function () { @@ -43,25 +41,22 @@ $outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap()->withData(), $xml); - expect($outputs)->toHaveCount(3); - - expect($outputs[0]->get())->toBe([ - 'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5', - 'lastmod' => '2022-09-03', - 'priority' => '1', - ]); - - expect($outputs[1]->get())->toBe([ - 'url' => 
'https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php', - 'lastmod' => '2022-06-02', - 'priority' => '1', - ]); - - expect($outputs[2]->get())->toBe([ - 'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4', - 'lastmod' => '2022-05-10', - 'priority' => '0.7', - ]); + expect($outputs)->toHaveCount(3) + ->and($outputs[0]->get())->toBe([ + 'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5', + 'lastmod' => '2022-09-03', + 'priority' => '1', + ]) + ->and($outputs[1]->get())->toBe([ + 'url' => 'https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php', + 'lastmod' => '2022-06-02', + 'priority' => '1', + ]) + ->and($outputs[2]->get())->toBe([ + 'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4', + 'lastmod' => '2022-05-10', + 'priority' => '0.7', + ]); }); it('doesn\'t fail when sitemap is empty', function () { diff --git a/tests/Steps/XmlTest.php b/tests/Steps/XmlTest.php index 2884140..29b36da 100644 --- a/tests/Steps/XmlTest.php +++ b/tests/Steps/XmlTest.php @@ -13,96 +13,90 @@ it('returns single strings when extract is called with a selector only', function () { $output = helper_invokeStepWithInput( - Xml::each('bookstore/book')->extract('//title'), + Xml::each('bookstore book')->extract('title'), helper_getStepFilesContent('Xml/bookstore.xml'), ); - expect($output)->toHaveCount(4); - - expect($output[0]->get())->toBe('Everyday Italian'); - - expect($output[3]->get())->toBe('Learning XML'); + expect($output)->toHaveCount(4) + ->and($output[0]->get())->toBe('Everyday Italian') + ->and($output[3]->get())->toBe('Learning XML'); }); it('extracts data from an XML document with XPath queries per default', function () { $output = helper_invokeStepWithInput( - Xml::each('bookstore/book')->extract(['title' => '//title', 'author' => '//author', 'year' => '//year']), + Xml::each('bookstore book')->extract([ + 'title' => 'title', + 'author' => 'author', + 'year' => 'year', 
+ ]), helper_getStepFilesContent('Xml/bookstore.xml'), ); - expect($output)->toHaveCount(4); - - expect($output[0]->get())->toBe( - ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], - ); - - expect($output[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. Rowling', 'year' => '2005']); - - expect($output[2]->get())->toBe( - [ - 'title' => 'XQuery Kick Start', - 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], - 'year' => '2003', - ], - ); - - expect($output[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); + expect($output)->toHaveCount(4) + ->and($output[0]->get())->toBe( + ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], + ) + ->and($output[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. Rowling', 'year' => '2005']) + ->and($output[2]->get())->toBe( + [ + 'title' => 'XQuery Kick Start', + 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], + 'year' => '2003', + ], + ) + ->and($output[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. 
Ray', 'year' => '2003']); }); -it('can also extract data using CSS selectors', function () { +it('can also extract data using XPath queries', function () { $output = helper_invokeStepWithInput( - Xml::each(Dom::cssSelector('bookstore book'))->extract([ - 'title' => Dom::cssSelector('title'), - 'author' => Dom::cssSelector('author'), - 'year' => Dom::cssSelector('year'), + Xml::each(Dom::xPath('//bookstore/book'))->extract([ + 'title' => Dom::xPath('//title'), + 'author' => Dom::xPath('//author'), + 'year' => Dom::xPath('//year'), ]), helper_getStepFilesContent('Xml/bookstore.xml'), ); - expect($output)->toHaveCount(4); - - expect($output[2]->get())->toBe( - [ - 'title' => 'XQuery Kick Start', - 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], - 'year' => '2003', - ], - ); + expect($output)->toHaveCount(4) + ->and($output[2]->get())->toBe( + [ + 'title' => 'XQuery Kick Start', + 'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'], + 'year' => '2003', + ], + ); }); it('returns only one (compound) output when the root method is used', function () { $output = helper_invokeStepWithInput( - Xml::root()->extract(['title' => '//title', 'author' => '//author', 'year' => '//year']), + Xml::root()->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']), helper_getStepFilesContent('Xml/bookstore.xml'), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get()['title'])->toBe(['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML']); + expect($output)->toHaveCount(1) + ->and($output[0]->get()['title'])->toBe(['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML']); }); it('extracts the data of the first matching element when the first method is used', function () { $output = helper_invokeStepWithInput( - Xml::first('bookstore/book')->extract(['title' => '//title', 'author' => '//author', 'year' => '//year']), + 
Xml::first('bookstore book')->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']), helper_getStepFilesContent('Xml/bookstore.xml'), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe( - ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], - ); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe( + ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'], + ); }); it('extracts the data of the last matching element when the last method is used', function () { $output = helper_invokeStepWithInput( - Xml::last('bookstore/book')->extract(['title' => '//title', 'author' => '//author', 'year' => '//year']), + Xml::last('bookstore book')->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']), helper_getStepFilesContent('Xml/bookstore.xml'), ); - expect($output)->toHaveCount(1); - - expect($output[0]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']); + expect($output)->toHaveCount(1) + ->and($output[0]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. 
Ray', 'year' => '2003']); }); test( @@ -115,51 +109,50 @@ function () { ); $outputs = helper_invokeStepWithInput( - Xml::each('//events/event')->extract([ - 'title' => '//name', - 'location' => '//location', - 'date' => '//date', - 'talks' => Xml::each('//talks/talk')->extract([ - 'title' => '//title', - 'speaker' => '//speaker', + Xml::each('events event')->extract([ + 'title' => 'name', + 'location' => 'location', + 'date' => 'date', + 'talks' => Xml::each('talks talk')->extract([ + 'title' => 'title', + 'speaker' => 'speaker', ]), ]), $response, ); - expect($outputs)->toHaveCount(2); - - expect($outputs[0]->get())->toBe([ - 'title' => 'Some Meetup', - 'location' => 'Somewhere', - 'date' => '2023-01-14 20:00', - 'talks' => [ - [ - 'title' => 'Sophisticated talk title', - 'speaker' => 'Super Mario', - ], - [ - 'title' => 'Fun talk', - 'speaker' => 'Princess Peach', + expect($outputs)->toHaveCount(2) + ->and($outputs[0]->get())->toBe([ + 'title' => 'Some Meetup', + 'location' => 'Somewhere', + 'date' => '2023-01-14 20:00', + 'talks' => [ + [ + 'title' => 'Sophisticated talk title', + 'speaker' => 'Super Mario', + ], + [ + 'title' => 'Fun talk', + 'speaker' => 'Princess Peach', + ], ], - ], - ]); - - expect($outputs[1]->get())->toBe([ - 'title' => 'Another Meetup', - 'location' => 'Somewhere else', - 'date' => '2023-01-21 19:00', - 'talks' => [ - [ - 'title' => 'Join the dark side', - 'speaker' => 'Wario', + ]) + ->and($outputs[1]->get())->toBe([ + 'title' => 'Another Meetup', + 'location' => 'Somewhere else', + 'date' => '2023-01-21 19:00', + 'talks' => [ + [ + 'title' => 'Join the dark side', + 'speaker' => 'Wario', + ], + [ + 'title' => 'Let\'s go', + 'speaker' => 'Yoshi', + ], ], - [ - 'title' => 'Let\'s go', - 'speaker' => 'Yoshi', - ], - ], - ]); + ]); + }, ); @@ -170,9 +163,9 @@ function () { ); $outputs = helper_invokeStepWithInput( - Xml::each('//channel/item')->extract([ - 'url' => '//link', - 'title' => '//title', + Xml::each('channel item')->extract([ + 
'url' => 'link', + 'title' => 'title', ]), $response, ); @@ -182,3 +175,44 @@ function () { 'title' => 'Some title', ]); }); + +it('works with tags with camelCase names', function () { + $xml = << + + foo + foo + + + abc-123 + 2024-11-07T11:00:31Z + Foo bar baz! + https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xml + + test + + + + + XML; + + $response = new RespondedRequest( + new Request('GET', 'https://www.example.com/xml-feed'), + new Response(body: $xml), + ); + + $outputs = helper_invokeStepWithInput( + Xml::each(Dom::cssSelector('feed items item'))->extract([ + 'title' => 'title', + 'some-url' => 'someUrl', + 'foo-bar-baz' => 'foo baRbaz', + ]), + $response, + ); + + expect($outputs[0]->get())->toBe([ + 'title' => 'Foo bar baz!', + 'some-url' => 'https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xml', + 'foo-bar-baz' => 'test', + ]); +})->group('php84'); diff --git a/tests/_Integration/Http/CrawlingTest.php b/tests/_Integration/Http/CrawlingTest.php index cdaa076..7b57271 100644 --- a/tests/_Integration/Http/CrawlingTest.php +++ b/tests/_Integration/Http/CrawlingTest.php @@ -10,6 +10,7 @@ use Crwlr\Crawler\Loader\Http\Politeness\Throttler; use Crwlr\Crawler\Loader\Http\Politeness\TimingUnits\MultipleOf; use Crwlr\Crawler\Result; +use Crwlr\Crawler\Steps\Dom\HtmlElement; use Crwlr\Crawler\Steps\Loading\Http; use Crwlr\Crawler\UserAgents\UserAgent; use Crwlr\Crawler\UserAgents\UserAgentInterface; @@ -151,9 +152,8 @@ public function getLoader(): TestLoader $crawler->runAndTraverse(); - expect($crawler->getLoader()->loadedUrls)->toContain('http://foo.example.com/crawling/main-on-subdomain'); - - expect($crawler->getLoader()->loadedUrls)->not()->toContain('https://www.crwlr.software/packages/crawler'); + expect($crawler->getLoader()->loadedUrls)->toContain('http://foo.example.com/crawling/main-on-subdomain') + ->and($crawler->getLoader()->loadedUrls)->not()->toContain('https://www.crwlr.software/packages/crawler'); }); it('stays on the 
same host when method sameHost() is called', function () { @@ -177,19 +177,13 @@ public function getLoader(): TestLoader $crawler->runAndTraverse(); - expect($crawler->getLoader()->loadedUrls)->toHaveCount(6); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); + expect($crawler->getLoader()->loadedUrls)->toHaveCount(6) + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); }); it('crawls only to a certain depth when the crawl depth is defined', function () { @@ -254,13 +248,10 @@ function () { $crawler->runAndTraverse(); - expect($crawler->getLoader()->loadedUrls)->toHaveCount(3); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sitemap.xml'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1'); + 
expect($crawler->getLoader()->loadedUrls)->toHaveCount(3) + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sitemap.xml') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1'); }); it('loads only URLs where the path matches a regex when method pathMatches() is used', function () { @@ -294,13 +285,10 @@ function () { $crawler->runAndTraverse(); - expect($crawler->getLoader()->loadedUrls)->toHaveCount(4); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); + expect($crawler->getLoader()->loadedUrls)->toHaveCount(4) + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub1/sub1') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); }); it( @@ -311,22 +299,18 @@ function () { ->input('http://www.example.com/crawling/main') ->addStep( Http::crawl() - ->customFilter(function (Url $url, ?\Symfony\Component\DomCrawler\Crawler $linkElement) { + ->customFilter(function (Url $url, ?HtmlElement $linkElement) { return $linkElement && str_contains($linkElement->text(), 'Subpage 2'); }), ); $crawler->runAndTraverse(); - expect($crawler->getLoader()->loadedUrls)->toHaveCount(4); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2'); - - expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1'); - - 
expect($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); + expect($crawler->getLoader()->loadedUrls)->toHaveCount(4) + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/main') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1') + ->and($crawler->getLoader()->loadedUrls)->toContain('http://www.example.com/crawling/sub2/sub1/sub1'); }, ); @@ -345,9 +329,8 @@ function () { $results = helper_generatorToArray($crawler->run()); - expect($crawler->getLoader()->loadedUrls)->toHaveCount(7); - - expect($results)->toHaveCount(3); + expect($crawler->getLoader()->loadedUrls)->toHaveCount(7) + ->and($results)->toHaveCount(3); }, ); @@ -366,9 +349,8 @@ function () { $results = helper_generatorToArray($crawler->run()); - expect($crawler->getLoader()->loadedUrls)->toHaveCount(7); - - expect($results)->toHaveCount(2); + expect($crawler->getLoader()->loadedUrls)->toHaveCount(7) + ->and($results)->toHaveCount(2); }, ); @@ -393,9 +375,8 @@ function () { $results = helper_generatorToArray($crawler->run()); - expect($crawler->getLoader()->loadedUrls)->toHaveCount(7); - - expect($results)->toHaveCount(3); + expect($crawler->getLoader()->loadedUrls)->toHaveCount(7) + ->and($results)->toHaveCount(3); }, ); diff --git a/tests/_Integration/Http/HeadlessBrowserTest.php b/tests/_Integration/Http/HeadlessBrowserTest.php index 4c00153..a818a42 100644 --- a/tests/_Integration/Http/HeadlessBrowserTest.php +++ b/tests/_Integration/Http/HeadlessBrowserTest.php @@ -7,6 +7,7 @@ use Crwlr\Crawler\Loader\Http\Cookies\CookieJar; use Crwlr\Crawler\Loader\Http\HttpLoader; use Crwlr\Crawler\Loader\LoaderInterface; +use Crwlr\Crawler\Steps\Dom\HtmlDocument; use Crwlr\Crawler\Steps\Html; use Crwlr\Crawler\Steps\Loading\Http; use Crwlr\Crawler\Steps\Loading\Http\Browser\BrowserAction; @@ 
-15,7 +16,6 @@ use Crwlr\Crawler\UserAgents\UserAgentInterface; use Generator; use Psr\Log\LoggerInterface; -use Symfony\Component\DomCrawler\Crawler; use function tests\helper_generatorToArray; use function tests\helper_getFastLoader; @@ -43,7 +43,7 @@ protected function invoke(mixed $input): Generator { $html = Http::getBodyString($input->response); - $jsonString = (new Crawler($html))->filter('body pre')->text(); + $jsonString = (new HtmlDocument($html))->querySelector('body pre')?->text() ?? ''; yield json_decode($jsonString, true); } @@ -55,7 +55,7 @@ protected function invoke(mixed $input): Generator { $html = Http::getBodyString($input->response); - yield (new Crawler($html))->filter('body')->text(); + yield (new HtmlDocument($html))->querySelector('body')?->text() ?? ''; } }