diff --git a/CHANGELOG.md b/CHANGELOG.md index 45ff5f0..dd7ffc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [3.0.2] - 2024-12-11 +### Fixed +* When the new PHP 8.4 DOM API is used, and HTML declared as UTF-8 contains non UTF-8 compatible characters, it does not replace them with a � character, but instead removes them. This behaviour is consistent with the data returned by Symfony DomCrawler. + ## [3.0.1] - 2024-12-10 ### Undeprecated * Removed deprecations for all XPath functionality (`Dom::xPath()`, `XPathQuery` class and `Node::queryXPath()`), because it's still available with the net DOM API in PHP 8.4. diff --git a/src/Steps/Html/DomQuery.php b/src/Steps/Html/DomQuery.php index be82d3f..c8da0db 100644 --- a/src/Steps/Html/DomQuery.php +++ b/src/Steps/Html/DomQuery.php @@ -303,6 +303,10 @@ protected function getTarget(HtmlElement|XmlElement $node): string $target = $this->handleUrlFragment(Url::parse($this->baseUrl)->resolve($target)); } + if (str_contains($target, '�')) { + $target = str_replace('�', '', $target); + } + return $target; } diff --git a/tests/_Integration/Http/CharsetTest.php b/tests/_Integration/Http/CharsetTest.php new file mode 100644 index 0000000..be9102a --- /dev/null +++ b/tests/_Integration/Http/CharsetTest.php @@ -0,0 +1,41 @@ +input('http://localhost:8000/non-utf-8-charset') + ->addStep(Http::get()) + ->addStep(Html::root()->extract(['foo' => '.element'])); + + $results = helper_generatorToArray($crawler->run()); + + expect($results)->toHaveCount(1) + ->and($results[0]->toArray())->toBe(['foo' => '0 l/m']); +}); diff --git a/tests/_Integration/Server.php b/tests/_Integration/Server.php index 0881da6..c063f76 100644 --- a/tests/_Integration/Server.php +++ b/tests/_Integration/Server.php @@ -204,3 +204,7 @@ function getParamAfter(string $route, string $after): string header('Location: http://localhost:8000/redirect?no=' . 
($redirectNo + 1) . $stopAt); } + +if (str_starts_with($route, '/non-utf-8-charset')) { + return include(__DIR__ . '/_Server/NonUtf8.php'); +} diff --git a/tests/_Integration/_Server/NonUtf8.php b/tests/_Integration/_Server/NonUtf8.php new file mode 100644 index 0000000..61c9ca2 --- /dev/null +++ b/tests/_Integration/_Server/NonUtf8.php @@ -0,0 +1,22 @@ + + + + + + Non UTF-8 charset page + + +
+ +
+ +