Skip to content

Commit

Permalink
Fix non UTF-8 char issue
Browse files Browse the repository at this point in the history
When the new PHP 8.4 DOM API is used, and HTML declared as UTF-8
contains non UTF-8 compatible characters, it does not replace them
with a � character, but instead removes them. This behaviour is consistent
with the data returned by Symfony DomCrawler.
  • Loading branch information
otsch committed Dec 11, 2024
1 parent b10364e commit 1993669
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [3.0.2] - 2024-12-11
### Fixed
* When the new PHP 8.4 DOM API is used, and HTML declared as UTF-8 contains non UTF-8 compatible characters, it does not replace them with a � character, but instead removes them. This behaviour is consistent with the data returned by Symfony DomCrawler.

## [3.0.1] - 2024-12-10
### Undeprecated
* Removed deprecations for all XPath functionality (`Dom::xPath()`, `XPathQuery` class and `Node::queryXPath()`), because it's still available with the new DOM API in PHP 8.4.
Expand Down
4 changes: 4 additions & 0 deletions src/Steps/Html/DomQuery.php
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,10 @@ protected function getTarget(HtmlElement|XmlElement $node): string
$target = $this->handleUrlFragment(Url::parse($this->baseUrl)->resolve($target));
}

if (str_contains($target, '')) {
$target = str_replace('', '', $target);
}

return $target;
}

Expand Down
41 changes: 41 additions & 0 deletions tests/_Integration/Http/CharsetTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

/**
 * Crawler used by the charset integration test in this file.
 *
 * It only supplies the two pieces the test needs: a fixed user agent and a
 * loader obtained from the shared test helper.
 */
class CharsetExampleCrawler extends HttpCrawler
{
    /**
     * Returns the user agent this crawler identifies itself with.
     */
    protected function userAgent(): UserAgentInterface
    {
        $agent = new UserAgent('SomeUserAgent');

        return $agent;
    }

    /**
     * Builds the loader by delegating to the helper_getFastLoader() test helper.
     */
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        $loader = helper_getFastLoader($userAgent, $logger);

        return $loader;
    }
}

it('removes (and not replaces with broken ? replacement char) non utf-8 characters from extracted data', function (): void {
    $crawler = new CharsetExampleCrawler();

    // Load the fixture page from the local test server and extract the text
    // of the `.element` node into the `foo` key.
    $crawler
        ->input('http://localhost:8000/non-utf-8-charset')
        ->addStep(Http::get())
        ->addStep(Html::root()->extract(['foo' => '.element']));

    $outputs = helper_generatorToArray($crawler->run());

    // Exactly one result is expected ...
    expect($outputs)->toHaveCount(1);

    // ... and its extracted text must end right after "0 l/m": the invalid
    // byte the fixture appends must be gone, not turned into another character.
    expect($outputs[0]->toArray())->toBe(['foo' => '0 l/m']);
});
4 changes: 4 additions & 0 deletions tests/_Integration/Server.php
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,7 @@ function getParamAfter(string $route, string $after): string

header('Location: http://localhost:8000/redirect?no=' . ($redirectNo + 1) . $stopAt);
}

// Route for the charset integration test: hand the request over to the
// NonUtf8.php fixture page and return whatever the include yields.
if (str_starts_with($route, '/non-utf-8-charset')) {
    return include __DIR__ . '/_Server/NonUtf8.php';
}
22 changes: 22 additions & 0 deletions tests/_Integration/_Server/NonUtf8.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<!Doctype html>
<html>
<head>
<meta charset="UTF-8">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Non UTF-8 charset page</title>
</head>
<body>
<div class="element">
<?php
// Emit "0 l/m" followed by the single byte 0xB2 (decimal 178). That byte is
// the superscript two (²) in ISO-8859-1, but on its own it is broken in
// UTF-8, which is the charset this page declares.
echo "0 l/m\xB2";
?>
</div>
</body>
</html>

0 comments on commit 1993669

Please sign in to comment.