Skip to content

Commit

Permalink
HTML API: Include doctype in full parser serialize.
Browse files Browse the repository at this point in the history
Output DOCTYPE when calling `WP_HTML_Processor::serialize` on a full document that includes a DOCTYPE.

The DOCTYPE should be included in the serialized/normalized HTML output as it has an impact on how the document is handled, in particular whether the document should be handled in quirks or no-quirks mode.

This only affects the serialization of full parsers at this time because DOCTYPE tokens are currently ignored in all possible fragments. The omission of the DOCTYPE is subtle but can change the serialized document's quirks/no-quirks mode.

Props jonsurrell.
Fixes #62396.

git-svn-id: https://develop.svn.wordpress.org/trunk@59399 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
ockham committed Nov 13, 2024
1 parent db2f6fe commit 1a27422
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 4 deletions.
28 changes: 24 additions & 4 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1178,6 +1178,30 @@ protected function serialize_token(): string {
$token_type = $this->get_token_type();

switch ( $token_type ) {
case '#doctype':
$doctype = $this->get_doctype_info();
if ( null === $doctype ) {
break;
}

$html .= '<!DOCTYPE';

if ( $doctype->name ) {
$html .= " {$doctype->name}";
}

if ( null !== $doctype->public_identifier ) {
$html .= " PUBLIC \"{$doctype->public_identifier}\"";
}
if ( null !== $doctype->system_identifier ) {
if ( null === $doctype->public_identifier ) {
$html .= ' SYSTEM';
}
$html .= " \"{$doctype->system_identifier}\"";
}
$html .= '>';
break;

case '#text':
$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
break;
Expand All @@ -1194,10 +1218,6 @@ protected function serialize_token(): string {
case '#cdata-section':
$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
break;

case 'html':
$html .= '<!DOCTYPE html>';
break;
}

if ( '#tag' !== $token_type ) {
Expand Down
33 changes: 33 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
Original file line number Diff line number Diff line change
Expand Up @@ -284,4 +284,37 @@ public static function data_tokens_with_null_bytes() {
'Comment text' => array( "<!-- \x00 -->", "<!-- \u{FFFD} -->" ),
);
}

/**
 * Ensures that serializing a full document reproduces the expected DOCTYPE
 * ahead of the `<html>` element.
 *
 * The input document is the supplied DOCTYPE source immediately followed by
 * a single text character; the expected output is the supplied DOCTYPE
 * followed by the complete html/head/body structure wrapping that character.
 *
 * @ticket 62396
 *
 * @dataProvider data_provider_serialize_doctype
 *
 * @param string $doctype_input  DOCTYPE source placed at the start of the parsed document (may be empty).
 * @param string $doctype_output DOCTYPE expected at the start of the serialized document (may be empty).
 */
public function test_full_document_serialize_includes_doctype( string $doctype_input, string $doctype_output ) {
	$document = "{$doctype_input}👌";
	$expected = "{$doctype_output}<html><head></head><body>👌</body></html>";

	$serialized = WP_HTML_Processor::create_full_parser( $document )->serialize();

	$this->assertSame( $expected, $serialized );
}

/**
 * Data provider.
 *
 * Supplies labelled pairs of strings: the DOCTYPE source to put at the start
 * of a document, and the DOCTYPE expected at the start of its serialization.
 *
 * @return array[] Map of dataset label to array( input DOCTYPE, expected DOCTYPE ).
 */
public static function data_provider_serialize_doctype() {
	$datasets = array();

	// No DOCTYPE in the input: none is expected in the output.
	$datasets['None'] = array( '', '' );

	// DOCTYPE declarations expected to be reproduced unchanged.
	$datasets['Empty'] = array( '<!DOCTYPE>', '<!DOCTYPE>' );
	$datasets['HTML5'] = array( '<!DOCTYPE html>', '<!DOCTYPE html>' );

	// The mixed-case name is expected back in lowercase.
	$datasets['Strange name'] = array( '<!DOCTYPE WordPress>', '<!DOCTYPE wordpress>' );

	// Public and system identifiers, alone and combined.
	$datasets['With public']            = array( '<!DOCTYPE html PUBLIC "x">', '<!DOCTYPE html PUBLIC "x">' );
	$datasets['With system']            = array( '<!DOCTYPE html SYSTEM "y">', '<!DOCTYPE html SYSTEM "y">' );
	$datasets['With public and system'] = array( '<!DOCTYPE html PUBLIC "x" "y">', '<!DOCTYPE html PUBLIC "x" "y">' );

	// Mixed-case keywords, a single-quoted identifier, no separating spaces and
	// trailing text: the expected output is the normalized declaration.
	$datasets['Weird casing'] = array( '<!docType HtmL pubLIc\'xxx\'"yyy" all this is ignored>', '<!DOCTYPE html PUBLIC "xxx" "yyy">' );

	return $datasets;
}
}

0 comments on commit 1a27422

Please sign in to comment.