From d844c240576381bce7409edd5724dd6eb93d371c Mon Sep 17 00:00:00 2001 From: Simon Asika Date: Mon, 9 Dec 2024 12:51:36 +0800 Subject: [PATCH 1/4] php 8.4 new dom output --- src/HTML5/Serializer/OutputRules.php | 77 +++++++++++++++++++++++----- src/HTML5/Serializer/Traverser.php | 10 ++-- 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/src/HTML5/Serializer/OutputRules.php b/src/HTML5/Serializer/OutputRules.php index ec467f22..0d712f68 100644 --- a/src/HTML5/Serializer/OutputRules.php +++ b/src/HTML5/Serializer/OutputRules.php @@ -9,6 +9,12 @@ namespace Masterminds\HTML5\Serializer; +use Dom\Attr; +use Dom\CharacterData; +use Dom\Document; +use Dom\Element; +use Dom\Node; +use Dom\XPath; use Masterminds\HTML5\Elements; /** @@ -229,9 +235,9 @@ public function element($ele) $this->openTag($ele); if (Elements::isA($name, Elements::TEXT_RAW)) { foreach ($ele->childNodes as $child) { - if ($child instanceof \DOMCharacterData) { + if ($child instanceof \DOMCharacterData || $child instanceof CharacterData) { $this->wr($child->data); - } elseif ($child instanceof \DOMElement) { + } elseif ($child instanceof \DOMElement || $child instanceof Element) { $this->element($child); } } @@ -299,13 +305,21 @@ public function processorInstruction($ele) */ protected function namespaceAttrs($ele) { - if (!$this->xpath || $this->xpath->document !== $ele->ownerDocument) { - $this->xpath = new \DOMXPath($ele->ownerDocument); - } + $isLegacyDocument = static::isLegacyDocument($ele); + + // Finding namespace in new \Dom\Document will cause error message: + // DOMException: The namespace axis is not well-defined in the living DOM specification. + // Use Dom\Element::getInScopeNamespaces() or Dom\Element::getDescendantNamespaces() instead. + if ($isLegacyDocument) { + // TODO: Fix the namespace attrs writing. + if (!$this->xpath || $this->xpath->document !== $ele->ownerDocument) { + $this->xpath = new \DOMXPath($ele->ownerDocument); + } - foreach ($this->xpath->query('namespace::*[not(.=../../namespace::*)]', $ele) as $nsNode) { - if (!in_array($nsNode->nodeValue, $this->implicitNamespaces)) { - $this->wr(' ')->wr($nsNode->nodeName)->wr('="')->wr($nsNode->nodeValue)->wr('"'); + foreach ($this->xpath->query('namespace::*[not(.=../../namespace::*)]', $ele) as $nsNode) { + if (!in_array($nsNode->nodeValue, $this->implicitNamespaces)) { + $this->wr(' ')->wr($nsNode->nodeName)->wr('="')->wr($nsNode->nodeValue)->wr('"'); + } } } } @@ -375,8 +389,14 @@ protected function attrs($ele) } } - protected function nonBooleanAttribute(\DOMAttr $attr) + protected function nonBooleanAttribute($attr) { + if (!$attr instanceof \DOMAttr && !$attr instanceof Attr) { + throw new \InvalidArgumentException( + __METHOD__ . '() argument 1 should be \DOMAttr or \Dom\Attr' + ); + } + $ele = $attr->ownerElement; foreach ($this->nonBooleanAttributes as $rule) { if (isset($rule['nodeNamespace']) && $rule['nodeNamespace'] !== $ele->namespaceURI) { @@ -415,10 +435,25 @@ protected function nonBooleanAttribute(\DOMAttr $attr) return false; } - private function getXPath(\DOMNode $node) + /** + * @param Node|\DOMNode $node + * + * @return XPath|\DOMXPath + */ + private function getXPath($node) { + $isLegacyDocument = static::isLegacyDocument($node); + + if ($isLegacyDocument) { + if (!$this->xpath) { + $this->xpath = new \DOMXPath($node->ownerDocument); + } + + return $this->xpath; + } + if (!$this->xpath) { - $this->xpath = new \DOMXPath($node->ownerDocument); + $this->xpath = new XPath($node->ownerDocument); } return $this->xpath; @@ -430,7 +465,7 @@ private function getXPath(\DOMNode $node) * Tags for HTML, MathML, and SVG are in the local name. Otherwise, use the * qualified name (8.3). * - * @param \DOMNode $ele The element being written. + * @param Node|\DOMNode $ele The element being written. */ protected function closeTag($ele) { @@ -550,4 +585,22 @@ protected function escape($text, $attribute = false) return strtr($text, $replace); } + + /** + * @param Node|\DOMNode $node + * + * @return bool + */ + protected static function isLegacyDocument($node) + { + if ($node instanceof Document) { + return false; + } + + if ($node instanceof \DOMDocument) { + return true; + } + + return $node->ownerDocument instanceof \DOMDocument; + } } diff --git a/src/HTML5/Serializer/Traverser.php b/src/HTML5/Serializer/Traverser.php index 1e8d7924..6814452c 100644 --- a/src/HTML5/Serializer/Traverser.php +++ b/src/HTML5/Serializer/Traverser.php @@ -2,6 +2,10 @@ namespace Masterminds\HTML5\Serializer; +use Dom\Document; +use Dom\DocumentFragment; +use Dom\NodeList; + /** * Traverser for walking a DOM tree. * @@ -60,16 +64,16 @@ public function __construct($dom, $out, RulesInterface $rules, $options = array( */ public function walk() { - if ($this->dom instanceof \DOMDocument) { + if ($this->dom instanceof \DOMDocument || $this->dom instanceof Document) { $this->rules->document($this->dom); - } elseif ($this->dom instanceof \DOMDocumentFragment) { + } elseif ($this->dom instanceof \DOMDocumentFragment || $this->dom instanceof DocumentFragment) { // Document fragments are a special case. Only the children need to // be serialized. if ($this->dom->hasChildNodes()) { $this->children($this->dom->childNodes); } } // If NodeList, loop - elseif ($this->dom instanceof \DOMNodeList) { + elseif ($this->dom instanceof \DOMNodeList || $this->dom instanceof NodeList) { // If this is a NodeList of DOMDocuments this will not work. $this->children($this->dom); } // Else assume this is a DOMNode-like datastructure. From c35c923c9f8b55a1da09e7f48c3d0f522d1d9148 Mon Sep 17 00:00:00 2001 From: Simon Asika Date: Mon, 9 Dec 2024 12:54:33 +0800 Subject: [PATCH 2/4] Add AllowDynamicProperties attributes --- src/HTML5/Parser/Scanner.php | 1 + test/HTML5/Serializer/TraverserTest.php | 1 + 2 files changed, 2 insertions(+) diff --git a/src/HTML5/Parser/Scanner.php b/src/HTML5/Parser/Scanner.php index 1b25888b..5b960259 100644 --- a/src/HTML5/Parser/Scanner.php +++ b/src/HTML5/Parser/Scanner.php @@ -7,6 +7,7 @@ /** * The scanner scans over a given data input to react appropriately to characters. */ +#[\AllowDynamicProperties] class Scanner { const CHARS_HEX = 'abcdefABCDEF01234567890'; diff --git a/test/HTML5/Serializer/TraverserTest.php b/test/HTML5/Serializer/TraverserTest.php index 81dd4444..5183f411 100644 --- a/test/HTML5/Serializer/TraverserTest.php +++ b/test/HTML5/Serializer/TraverserTest.php @@ -5,6 +5,7 @@ use Masterminds\HTML5\Serializer\OutputRules; use Masterminds\HTML5\Serializer\Traverser; +#[\AllowDynamicProperties] class TraverserTest extends \Masterminds\HTML5\Tests\TestCase { protected $markup = ' From b79dbb5d00b5794410a3966180d017e1526ad12d Mon Sep 17 00:00:00 2001 From: Simon Asika Date: Mon, 9 Dec 2024 13:24:23 +0800 Subject: [PATCH 3/4] Test for php 8.4 --- .../Serializer/OutputRulesNewDomTest.php | 639 ++++++++++++++++++ 1 file changed, 639 insertions(+) create mode 100644 test/HTML5/Serializer/OutputRulesNewDomTest.php diff --git a/test/HTML5/Serializer/OutputRulesNewDomTest.php b/test/HTML5/Serializer/OutputRulesNewDomTest.php new file mode 100644 index 00000000..b8d99274 --- /dev/null +++ b/test/HTML5/Serializer/OutputRulesNewDomTest.php @@ -0,0 +1,639 @@ + + + + + Test + + +

This is a test.

+ + '; + + /** + * @var HTML5 + */ + protected $html5; + + public static function setUpBeforeClass(): void + { + if (PHP_VERSION_ID < 80400) { + self::markTestSkipped('New DOM only supports PHP 8.4+'); + } + + parent::setUpBeforeClass(); + } + + /** + * @before + */ + public function before() + { + $this->html5 = $this->getInstance(); + } + + public function loadHTML($html) + { + return HTMLDocument::createFromString($html); + } + + /** + * Using reflection we make a protected method accessible for testing. + * + * @param string $name + * The name of the method on the Traverser class to test + * + * @return \ReflectionMethod for the specified method + */ + public function getProtectedMethod($name) + { + $class = new \ReflectionClass('\Masterminds\HTML5\Serializer\OutputRules'); + $method = $class->getMethod($name); + $method->setAccessible(true); + + return $method; + } + + public function getTraverserProtectedProperty($name) + { + $class = new \ReflectionClass('\Masterminds\HTML5\Serializer\Traverser'); + $property = $class->getProperty($name); + $property->setAccessible(true); + + return $property; + } + + public function getOutputRules($options = array()) + { + $options = $options + $this->html5->getOptions(); + $stream = fopen('php://temp', 'w'); + $dom = $this->loadHTML($this->markup); + $r = new OutputRules($stream, $options); + $t = new Traverser($dom, $stream, $r, $options); + + return array( + $r, + $stream, + ); + } + + public function testDocument() + { + $dom = $this->loadHTML('foo'); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $r->document($dom); + $expected = '' . PHP_EOL . 'foo' . PHP_EOL; + $this->assertEquals($expected, stream_get_contents($stream, -1, 0)); + } + + public function testEmptyDocument() + { + $dom = $this->loadHTML(''); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $r->document($dom); + $expected = '' . PHP_EOL . '' . PHP_EOL; + $this->assertEquals($expected, stream_get_contents($stream, -1, 0)); + } + + public function testDoctype() + { + $dom = $this->loadHTML('foo'); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $m = $this->getProtectedMethod('doctype'); + $m->invoke($r, 'foo'); + $this->assertEquals('' . PHP_EOL, stream_get_contents($stream, -1, 0)); + } + + public function testElement() + { + $dom = $this->loadHTML( + ' + + +
foo bar baz
+ + + + + + + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('div'); + $r->element($list->item(0)); + $this->assertEquals('
foo bar baz
', stream_get_contents($stream, -1, 0)); + } + + public function testSerializeWithNamespaces() + { + $this->html5 = $this->getInstance(array( + 'xmlNamespaces' => true, + )); + + $source = ' + + + + xy + + svg + +
+ + y + '; + + $dom = $this->loadHTML($source, array( + 'xmlNamespaces' => true, + )); + $this->assertFalse($this->html5->hasErrors(), print_r($this->html5->getErrors(), 1)); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $t->walk(); + $rendered = stream_get_contents($stream, -1, 0); + + $clear = function ($s) { + return trim(preg_replace('/[\s]+/', ' ', $s)); + }; + + $this->assertEquals($clear($source), $clear($rendered)); + } + + public function testElementWithScript() + { + $dom = $this->loadHTML( + ' + + + + + +
foo bar baz
+ + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $script = $dom->getElementsByTagName('script'); + $r->element($script->item(0)); + $this->assertEquals( + '', stream_get_contents($stream, -1, 0)); + } + + public function testElementWithStyle() + { + $dom = $this->loadHTML( + ' + + + + + +
foo bar baz
+ + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $style = $dom->getElementsByTagName('style'); + $r->element($style->item(0)); + $this->assertEquals('', stream_get_contents($stream, -1, 0)); + } + + public function testOpenTag() + { + $dom = $this->loadHTML(' + + +
foo bar baz
+ + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('div'); + $m = $this->getProtectedMethod('openTag'); + $m->invoke($r, $list->item(0)); + $this->assertEquals('
', stream_get_contents($stream, -1, 0)); + } + + public function testComment() + { + $dom = $this->loadHTML(' + + +
+ + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('div'); + $r->comment($list->item(0)->childNodes->item(0)); + $this->assertEquals('', stream_get_contents($stream, -1, 0)); + + $dom = $this->loadHTML(' + + +
+ + '); + $dom->getElementById('foo')->appendChild($dom->createComment(' --> Foo -->')); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('div'); + $r->comment($list->item(0)->childNodes->item(0)); + + // Could not find more definitive guidelines on what this should be. Went with + // what the HTML5 spec says and what \DOMDocument::saveXML() produces. + $this->assertEquals(' --> Foo -->-->', stream_get_contents($stream, -1, 0)); + } + + public function testText() + { + $dom = $this->loadHTML(' + + + + + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('script'); + $r->text($list->item(0)->childNodes->item(0)); + $this->assertEquals('baz();', stream_get_contents($stream, -1, 0)); + + $dom = $this->loadHTML(' + + + '); + $foo = $dom->getElementById('foo'); + $foo->appendChild($dom->createTextNode('')); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $r->text($foo->firstChild); + $this->assertEquals('<script>alert("hi");</script>', stream_get_contents($stream, -1, 0)); + } + + public function testNl() + { + [$o, $s] = $this->getOutputRules(); + + $m = $this->getProtectedMethod('nl'); + $m->invoke($o); + $this->assertEquals(PHP_EOL, stream_get_contents($s, -1, 0)); + } + + public function testWr() + { + [$o, $s] = $this->getOutputRules(); + + $m = $this->getProtectedMethod('wr'); + $m->invoke($o, 'foo'); + $this->assertEquals('foo', stream_get_contents($s, -1, 0)); + } + + public function getEncData() + { + return array( + array( + false, + '&\'<>"', + '&\'<>"', + '&'<>"', + ), + array( + false, + 'This + is. a < test', + 'This + is. a < test', + 'This + is. a < test', + ), + array( + false, + '.+#', + '.+#', + '.+#', + ), + + array( + true, + '.+#\'', + '.+#\'', + '.+#'', + ), + array( + true, + '&".<', + '&".<', + '&".<', + ), + array( + true, + '&\'<>"', + '&\'<>"', + '&'<>"', + ), + array( + true, + "\xc2\xa0\"'", + ' "\'', + ' "'', + ), + ); + } + + /** + * Test basic encoding of text. + * + * @dataProvider getEncData + */ + public function testEnc($isAttribute, $test, $expected, $expectedEncoded) + { + [$o, $s] = $this->getOutputRules(); + $m = $this->getProtectedMethod('enc'); + + $this->assertEquals($expected, $m->invoke($o, $test, $isAttribute)); + + [$o, $s] = $this->getOutputRules(array( + 'encode_entities' => true, + )); + $m = $this->getProtectedMethod('enc'); + $this->assertEquals($expectedEncoded, $m->invoke($o, $test, $isAttribute)); + } + + /** + * Test basic encoding of text. + * + * @dataProvider getEncData + */ + public function testEscape($isAttribute, $test, $expected, $expectedEncoded) + { + [$o, $s] = $this->getOutputRules(); + $m = $this->getProtectedMethod('escape'); + + $this->assertEquals($expected, $m->invoke($o, $test, $isAttribute)); + } + + public function booleanAttributes() + { + return array( + array(''), + array(''), + array(''), + array(''), + array(''), + array(''), + array('
'), + array(''), + array('
'), + array(''), + ); + } + + /** + * @dataProvider booleanAttributes + */ + public function testBooleanAttrs($html) + { + $dom = $this->loadHTML('' . $html . ''); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $node = $dom->getElementsByTagName('body')->item(0)->firstChild; + + $m = $this->getProtectedMethod('attrs'); + $m->invoke($r, $node); + + $content = stream_get_contents($stream, -1, 0); + + $html = preg_replace('~<[a-z]+(.*)>~', '\1', $html); + $html = preg_replace('~<[a-z]+(.*)/?>~', '\1', $html); + + $this->assertEquals($content, $html); + } + + public function testAttrs() + { + $dom = $this->loadHTML(' + + +
foo bar baz
+ + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('div'); + + $m = $this->getProtectedMethod('attrs'); + $m->invoke($r, $list->item(0)); + + $content = stream_get_contents($stream, -1, 0); + $this->assertEquals(' id="foo" class="bar baz"', $content); + } + + public function testSvg() + { + $dom = $this->loadHTML( + ' + + +
foo bar baz
+ + + + + + + + + + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('svg'); + $r->element($list->item(0)); + $contents = stream_get_contents($stream, -1, 0); + $this->assertRegExp('||', $contents); + $this->assertRegExp('||', $contents); + $this->assertRegExp('||', $contents); + } + + public function testMath() + { + $dom = $this->loadHTML( + ' + + +
foo bar baz
+ + x + + ± + + y + + + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('math'); + $r->element($list->item(0)); + $content = stream_get_contents($stream, -1, 0); + $this->assertRegExp('||', $content); + $this->assertRegExp('||', $content); + } + + public function testProcessorInstruction() + { + $doc = HTMLDocument::createEmpty(); + $dom = $doc->createProcessingInstruction('foo', 'bar '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $r->processorInstruction($dom); + $content = stream_get_contents($stream, -1, 0); + $this->assertRegExp('|<\?foo bar \?>|', $content); + } + + public function testAddressTag() + { + $dom = $this->loadHTML( + ' + + +
+ Dave Raggett, + Arnaud Le Hors, + contact persons for the W3C HTML Activity +
+ + '); + + $stream = fopen('php://temp', 'w'); + $r = new OutputRules($stream, $this->html5->getOptions()); + $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); + + $list = $dom->getElementsByTagName('address'); + $r->element($list->item(0)); + $contents = stream_get_contents($stream, -1, 0); + + $this->assertRegExp('|
|', $contents); + $this->assertRegExp('|Dave Raggett,|', $contents); + $this->assertRegExp('|Arnaud Le Hors,|', $contents); + $this->assertRegExp('|contact persons for the W3C HTML Activity|', $contents); + $this->assertRegExp('|
|', $contents); + } + + /** + * Ensure direct DOM manipulation doesn't break TEXT_RAW elements (iframe, script, etc...). + */ + public function testHandlingInvalidRawContent() + { + self::markTestSkipped('Currently \Dom\HTMLElement will break invalid HTML so skip this test.'); + + $dom = $this->loadHTML( + ' + + + + +'); + + $badNode = $dom->createElement('p'); + $badNode->textContent = 'Bar'; + + // modify the content of the TEXT_RAW element: ')); + } +} From 310e3f5703bab22b629634de5b92c081a5dd135a Mon Sep 17 00:00:00 2001 From: Simon Asika Date: Mon, 9 Dec 2024 13:39:47 +0800 Subject: [PATCH 4/4] Use LIBXML_HTML_NOIMPLIED --- test/HTML5/Serializer/OutputRulesNewDomTest.php | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/HTML5/Serializer/OutputRulesNewDomTest.php b/test/HTML5/Serializer/OutputRulesNewDomTest.php index b8d99274..858c0571 100644 --- a/test/HTML5/Serializer/OutputRulesNewDomTest.php +++ b/test/HTML5/Serializer/OutputRulesNewDomTest.php @@ -45,7 +45,10 @@ public function before() public function loadHTML($html) { - return HTMLDocument::createFromString($html); + return HTMLDocument::createFromString( + $html, + LIBXML_HTML_NOIMPLIED + ); } /** @@ -97,7 +100,7 @@ public function testDocument() $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); $r->document($dom); - $expected = '' . PHP_EOL . 'foo' . PHP_EOL; + $expected = '' . PHP_EOL . 'foo' . PHP_EOL; $this->assertEquals($expected, stream_get_contents($stream, -1, 0)); } @@ -110,7 +113,7 @@ public function testEmptyDocument() $t = new Traverser($dom, $stream, $r, $this->html5->getOptions()); $r->document($dom); - $expected = '' . PHP_EOL . '' . PHP_EOL; + $expected = '' . PHP_EOL; $this->assertEquals($expected, stream_get_contents($stream, -1, 0)); }