From e6dc26d6c444cf7d44b1538057bd1cd7ac799a75 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 17:21:51 +0100 Subject: [PATCH 01/18] HTML Processor: Add from #46345 --- .../html/class-wp-html-processor.php | 215 +++++++++++++++++ phpunit/html/wp-html-processor-test.php | 225 ++++++++++++++++++ 2 files changed, 440 insertions(+) create mode 100644 lib/experimental/html/class-wp-html-processor.php create mode 100644 phpunit/html/wp-html-processor-test.php diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php new file mode 100644 index 0000000000000..6fa236aa94a33 --- /dev/null +++ b/lib/experimental/html/class-wp-html-processor.php @@ -0,0 +1,215 @@ +` but not clear to how + * handle `

` given that `` is a formatting element but `

` is + * not, that `

` itself is a special element. + */ + + +class WP_HTML_Processor_Scan_State { + public $budget = 1000; + public $open_tags = array(); + public $match_depth = null; + + public function relative_depth() { + return count( $this->open_tags ); + } +} + + +class WP_HTML_Processor extends WP_HTML_Tag_Processor { + public function new_state() { + $state = new WP_HTML_Processor_Scan_State(); + $tag_name = $this->get_tag(); + + if ( ! self::is_html_void_element( $tag_name ) && ! $this->is_tag_closer() ) { + $state->open_tags[] = $tag_name; + } + + return $state; + } + + public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = null ) { + while ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) && $state->budget-- > 0 ) { + $tag_name = $this->get_tag(); + $is_closer = $this->is_tag_closer(); + $is_void = self::is_html_void_element( $tag_name ); + $type = self::classify_tag_type( $is_closer, $is_void ); + + /* + * Step 1. Update the stack of open tags. + * + * If and when we add more complete HTML parsing support we will also + * need to track the stack of active formats so that we can properly + * handle missing tags and overlapping tags. + */ + + switch ( $type ) { + case 'void': + /* + * Void tags (such as ) can't have children and so we + * won't push or pop them from the stack of open tags. + * + * If and when we support self-closing foreign tags we would + * need to separately track those, but their behavior matches + * this case. The self-closing flag is ignored for HTML5 tags. + */ + if ( 0 === $state->relative_depth() ) { + return false; + } + + break; + + case 'opener': + $state->open_tags[] = $tag_name; + break; + + case 'closer': + $last_tag = array_pop( $state->open_tags ); + + /* + * Currently we can only support fully-normative and balanced HTML5. + * If we encounter anything we don't expect then we will bail. In a + * future update we may perform more careful HTML parsing and unlock + * navigating through non-normative documents. 
+ */ + if ( $last_tag !== $tag_name ) { + return false; + } + + /* + * Step 2. Bail if we've reached the end of the tag in which we started. + */ + if ( 0 === $state->relative_depth() ) { + return false; + } + + break; + } + + /* + * Void elements don't enter the stack, but they do exist in the + * depth hierarchy, so we have to temporarily account for that. + * + * We could have followed the approach in the HTML5 spec by appending + * the void tag to the stack of open tags, and then remember to pop it + * when existing this function, but by tracking it like this we don't + * have to remember to do that. + */ + $depth = $type === 'void' + ? $state->relative_depth() + 1 + : $state->relative_depth(); + + /* + * Step 3. Determine if we have a matching tag. In addition to the query + * we pass along to the underlying tag processor we're going to allow + * specifying the relative depth for a match. For example, a CSS child + * combinator would specify that a match must have a relative depth of 1, + * indicating that it's a direct child of the surrounding element, whereas + * the descendant selector could match at any depth and so sets this to `null`. + * To prevent matching _above_ a tag we rely on the `bail_depth` to stop + * searching once we've exited the tag on which we started, or reach its parent. + */ + + if ( ! 
isset( $state->match_depth ) || $state->match_depth + 1 === $depth ) { + $this->parse_query( $query ); + if ( $this->matches() ) { + return true; + } + } + } + + return false; + } + + public function get_content_inside_balanced_tags() { + static $start_name = null; + static $end_name = null; + + if ( null === $start_name || array_key_exists( $start_name, $this->bookmarks ) ) { + $rand_id = rand( 1, PHP_INT_MAX ); + $start_name = "start_{$rand_id}"; + } + + if ( null === $end_name || array_key_exists( $end_name, $this->bookmarks ) ) { + $rand_id = rand( 1, PHP_INT_MAX ); + $end_name = "start_{$rand_id}"; + } + + $this->set_bookmark( $start_name ); + + $state = self::new_state(); + while ( $this->balanced_next( $state ) ) { + continue; + } + + $this->set_bookmark( $end_name ); + $content = $this->content_inside_bookmarks( $start_name, $end_name ); + $this->seek( $start_name ); + + $this->release_bookmark( $start_name ); + $this->release_bookmark( $end_name ); + + return $content; + } + + private function content_inside_bookmarks( $start_bookmark, $end_bookmark ) { + if ( ! isset( $this->bookmarks[ $start_bookmark ], $this->bookmarks[ $end_bookmark ] ) ) { + return null; + } + + $start = $this->bookmarks[ $start_bookmark ]; + $end = $this->bookmarks[ $end_bookmark ]; + + return substr( $this->get_updated_html(), $start->end + 1, $end->start - $start->end - 2 ); + } + + /* + * HTML-related Utility Functions + */ + + public static function classify_tag_type( $is_closer, $is_void ) { + if ( $is_void ) { + return 'void'; + } + + return $is_closer ? 
'closer' : 'opener'; + } + + /** + * @see https://html.spec.whatwg.org/#elements-2 + */ + public static function is_html_void_element( $tag_name ) { + switch ( $tag_name ) { + case 'AREA': + case 'BASE': + case 'BR': + case 'COL': + case 'EMBED': + case 'HR': + case 'IMG': + case 'INPUT': + case 'LINK': + case 'META': + case 'SOURCE': + case 'TRACK': + case 'WBR': + return true; + + default: + return false; + } + } +} diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php new file mode 100644 index 0000000000000..ae9089d761f4f --- /dev/null +++ b/phpunit/html/wp-html-processor-test.php @@ -0,0 +1,225 @@ +outside

inside
' ); + + $tags->next_tag( 'div' ); + $state = $tags->new_state(); + $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); + + $this->assertTrue( $tags->next_tag( 'div' ) ); + $state = $tags->new_state(); + $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); + } + + public function test_find_immediate_child_tag() { + $tags = new WP_HTML_Processor( '
' ); + + $tags->next_tag( 'div' ); + $state = $tags->new_state(); + $state->match_depth = 1; + $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); + } + + public function test_find_immediate_child_tag2() { + $tags = new WP_HTML_Processor( '
' ); + + $tags->next_tag( 'div' ); + $state = $tags->new_state(); + $state->match_depth = 1; + $this->assertTrue( $tags->balanced_next( $state, 'img' ), 'Did not find the wanted ' ); + $this->assertTrue( $tags->get_attribute( 'wanted' ), 'Found the wrong ' ); + } + + public function test_find_child_tag() { + $tags = new WP_HTML_Processor( '
' ); + + $tags->next_tag( 'div' ); + $state = $tags->new_state(); + $state->match_depth = 3; + $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); + } + + public function test_flushes_up_to_close_tag_from_deep_within() { + $tags = new WP_HTML_Processor( << +
+

Cows

+
+

Cows are clever.

+

Cows eat grass.

+
+

Things cows can't do

+
    +
  • Pilot aeroplanes

  • +
  • Drive race cars

  • +
  • Captain ships

  • +
+

This concludes our discussion of cows.

+
+
+

Oxen

+
+

Oxen are strong.

+
+
+ +HTML + ); + + $tags->next_tag( 'section' ); + $state = $tags->new_state(); + + // Jump inside this tag + $tags->balanced_next( $state, 'p' ); + $this->assertTrue( $tags->get_attribute( 'start' ) ); + // Then exit the outer section we were scanning. + while ( $tags->balanced_next( $state ) ) { + continue; + } + + $this->assertEquals( 'SECTION', $tags->get_tag() ); + $tags->next_tag( 'p' ); + $this->assertTrue( $tags->get_attribute( 'wanted' ) ); + } + + public function test_can_navigate_with_unique_state_throughout_structure() { + $tags = new WP_HTML_Processor( << +
+

Cows

+
+

Cows are clever.

+

Cows eat grass.

+
+

Things cows can't do

+
    +
  • Pilot aeroplanes

  • +
  • Drive race cars

  • +
  • Captain ships

  • +
+

This concludes our discussion of cows.

+
+
+

Oxen

+
+

Oxen are strong.

+
+
+ +HTML + ); + + $tags->next_tag( 'section' ); + $state = $tags->new_state(); + + // Jump inside this tag + $tags->balanced_next( $state, 'p' ); + $this->assertTrue( $tags->get_attribute( 'start' ) ); + + // Establish a new state/frame for navigating inside the outer structure. + $tags->balanced_next( $state, 'ul' ); + $li_count = 0; + $li_state = $tags->new_state(); + while ( $tags->balanced_next( $li_state, 'li' ) ) { + $li_count++; + } + $this->assertEquals( 3, $li_count ); + + // Ensure that we ended up where we expected. + $this->assertEquals( 'UL', $tags->get_tag() ); + $this->assertTrue( $tags->is_tag_closer() ); + $tags->next_tag(); + $this->assertTrue( $tags->get_attribute( 'inner' ) ); + + // And now flush out the previous stack/frame + while ( $tags->balanced_next( $state ) ) { + continue; + } + + // Ensure that we're back where we want to be after exiting two separate frames. + $this->assertEquals( 'P', $tags->get_tag() ); + $this->assertTrue( $tags->is_tag_closer() ); + $tags->next_tag( 'p' ); + $this->assertTrue( $tags->get_attribute( 'wanted' ) ); + } + + public function test_can_scan_through_tags_at_a_given_depth() { + $tags = new WP_HTML_Processor( << +
+

Cows

+
+

Cows are clever.

+

Cows eat grass.

+
+

Things cows can't do

+
    +
  • Pilot aeroplanes

  • +
  • Drive race cars

  • +
  • Captain ships

  • +
+

Things cows can do

+
    +
  • Chew cud

  • +
  • Moo

  • +
+

This concludes our discussion of cows.

+
+
+

Oxen

+
+

Oxen are strong.

+
+
+ +HTML + ); + + $tags->next_tag( 'section' ); + $state = $tags->new_state(); + $state->match_depth = 3; + + $p3_count = 0; + while ( $tags->balanced_next( $state, 'p' ) ) { + $p3_count++; + } + + // Did we only visit the tags inside section > * > * > p? + $this->assertEquals( 5, $p3_count ); + + $state = $tags->new_state(); + $state->match_depth = 2; + + $p2_count = 0; + while ( $tags->balanced_next( $state, 'p' ) ) { + $p2_count++; + } + + // Did we only visit the tags inside section > * > p? + $this->assertEquals( 1, $p2_count ); + } +} From 790dc56ad4438be79ec79c6016db02765936d558 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 17:25:03 +0100 Subject: [PATCH 02/18] Make WP_HTML_Tag_Processor::parse_query and ::matches protected --- lib/experimental/html/class-wp-html-tag-processor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php index 433c16a150806..8bb4e52fde068 100644 --- a/lib/experimental/html/class-wp-html-tag-processor.php +++ b/lib/experimental/html/class-wp-html-tag-processor.php @@ -1900,7 +1900,7 @@ public function get_updated_html() { * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. . * } */ - private function parse_query( $query ) { + protected function parse_query( $query ) { if ( null !== $query && $query === $this->last_query ) { return; } @@ -1947,7 +1947,7 @@ private function parse_query( $query ) { * * @return boolean */ - private function matches() { + protected function matches() { if ( $this->is_closing_tag && ! 
$this->stop_on_tag_closers ) { return false; } From 19fe8373d51fac2b1fae25c9c23f337453f39106 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 17:25:24 +0100 Subject: [PATCH 03/18] Add to wp-html.php --- lib/experimental/html/wp-html.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/experimental/html/wp-html.php b/lib/experimental/html/wp-html.php index dd3aeb7af45ae..0e5d0f735bc06 100644 --- a/lib/experimental/html/wp-html.php +++ b/lib/experimental/html/wp-html.php @@ -21,3 +21,7 @@ if ( ! class_exists( 'WP_HTML_Tag_Processor' ) ) { require_once __DIR__ . '/class-wp-html-tag-processor.php'; } + +if ( ! class_exists( 'WP_HTML_Processor' ) ) { + require_once __DIR__ . '/class-wp-html-processor.php'; +} From 6e623a1fbe3be0da81d3619b69c8a94d10ec72d5 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 17:25:36 +0100 Subject: [PATCH 04/18] Fix import path --- phpunit/html/wp-html-processor-test.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index ae9089d761f4f..fbe9bc4166ceb 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -7,7 +7,7 @@ * @subpackage HTML */ -require_once __DIR__ . '/../../lib/experimental/html/index.php'; +require_once __DIR__ . '/../../lib/experimental/html/wp-html.php'; if ( ! 
function_exists( 'esc_attr' ) ) { function esc_attr( $s ) { return htmlentities( $s, ENT_QUOTES, null, false ); } From 4a4fabca761e6a9a6d4945da946038369d52b943 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 17:27:50 +0100 Subject: [PATCH 05/18] Format --- .../html/class-wp-html-processor.php | 14 ++++++------ phpunit/html/wp-html-processor-test.php | 22 +++++++++++-------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 6fa236aa94a33..79f13814a135b 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -19,8 +19,8 @@ class WP_HTML_Processor_Scan_State { - public $budget = 1000; - public $open_tags = array(); + public $budget = 1000; + public $open_tags = array(); public $match_depth = null; public function relative_depth() { @@ -31,10 +31,10 @@ public function relative_depth() { class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function new_state() { - $state = new WP_HTML_Processor_Scan_State(); + $state = new WP_HTML_Processor_Scan_State(); $tag_name = $this->get_tag(); - if ( ! self::is_html_void_element( $tag_name ) && ! $this->is_tag_closer() ) { + if ( ! self::is_html_void_element( $tag_name ) && ! 
$this->is_tag_closer() ) { $state->open_tags[] = $tag_name; } @@ -136,15 +136,15 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul public function get_content_inside_balanced_tags() { static $start_name = null; - static $end_name = null; + static $end_name = null; if ( null === $start_name || array_key_exists( $start_name, $this->bookmarks ) ) { - $rand_id = rand( 1, PHP_INT_MAX ); + $rand_id = rand( 1, PHP_INT_MAX ); $start_name = "start_{$rand_id}"; } if ( null === $end_name || array_key_exists( $end_name, $this->bookmarks ) ) { - $rand_id = rand( 1, PHP_INT_MAX ); + $rand_id = rand( 1, PHP_INT_MAX ); $end_name = "start_{$rand_id}"; } diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index fbe9bc4166ceb..5d3696536fb7a 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -10,7 +10,8 @@ require_once __DIR__ . '/../../lib/experimental/html/wp-html.php'; if ( ! function_exists( 'esc_attr' ) ) { - function esc_attr( $s ) { return htmlentities( $s, ENT_QUOTES, null, false ); } + function esc_attr( $s ) { + return htmlentities( $s, ENT_QUOTES, null, false ); } } if ( ! class_exists( 'WP_UnitTestCase' ) ) { @@ -39,7 +40,7 @@ public function test_find_immediate_child_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = $tags->new_state(); + $state = $tags->new_state(); $state->match_depth = 1; $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); } @@ -48,7 +49,7 @@ public function test_find_immediate_child_tag2() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = $tags->new_state(); + $state = $tags->new_state(); $state->match_depth = 1; $this->assertTrue( $tags->balanced_next( $state, 'img' ), 'Did not find the wanted ' ); $this->assertTrue( $tags->get_attribute( 'wanted' ), 'Found the wrong ' ); @@ -58,13 +59,14 @@ public function test_find_child_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = $tags->new_state(); + $state = $tags->new_state(); $state->match_depth = 3; $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); } public function test_flushes_up_to_close_tag_from_deep_within() { - $tags = new WP_HTML_Processor( <<

Cows

@@ -107,7 +109,8 @@ public function test_flushes_up_to_close_tag_from_deep_within() { } public function test_can_navigate_with_unique_state_throughout_structure() { - $tags = new WP_HTML_Processor( <<

Cows

@@ -168,7 +171,8 @@ public function test_can_navigate_with_unique_state_throughout_structure() { } public function test_can_scan_through_tags_at_a_given_depth() { - $tags = new WP_HTML_Processor( <<

Cows

@@ -200,7 +204,7 @@ public function test_can_scan_through_tags_at_a_given_depth() { ); $tags->next_tag( 'section' ); - $state = $tags->new_state(); + $state = $tags->new_state(); $state->match_depth = 3; $p3_count = 0; @@ -211,7 +215,7 @@ public function test_can_scan_through_tags_at_a_given_depth() { // Did we only visit the tags inside section > * > * > p? $this->assertEquals( 5, $p3_count ); - $state = $tags->new_state(); + $state = $tags->new_state(); $state->match_depth = 2; $p2_count = 0; From 820b4513334a5c2229913b9b1e99d389b87ace65 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 17:43:22 +0100 Subject: [PATCH 06/18] Move WP_HTML_Processor_Scan_State to separate file --- .../class-wp-html-processor-scan-state.php | 18 ++++++++++++++++++ .../html/class-wp-html-processor.php | 10 ---------- lib/experimental/html/wp-html.php | 4 ++++ 3 files changed, 22 insertions(+), 10 deletions(-) create mode 100644 lib/experimental/html/class-wp-html-processor-scan-state.php diff --git a/lib/experimental/html/class-wp-html-processor-scan-state.php b/lib/experimental/html/class-wp-html-processor-scan-state.php new file mode 100644 index 0000000000000..c4c8cc1c2f7b4 --- /dev/null +++ b/lib/experimental/html/class-wp-html-processor-scan-state.php @@ -0,0 +1,18 @@ +open_tags ); + } +} diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 79f13814a135b..ef1c91dc44904 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -18,16 +18,6 @@ */ -class WP_HTML_Processor_Scan_State { - public $budget = 1000; - public $open_tags = array(); - public $match_depth = null; - - public function relative_depth() { - return count( $this->open_tags ); - } -} - class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function new_state() { diff --git a/lib/experimental/html/wp-html.php b/lib/experimental/html/wp-html.php index 
0e5d0f735bc06..68e7c57c7aaf3 100644 --- a/lib/experimental/html/wp-html.php +++ b/lib/experimental/html/wp-html.php @@ -22,6 +22,10 @@ require_once __DIR__ . '/class-wp-html-tag-processor.php'; } +if ( ! class_exists( 'WP_HTML_Processor_Scan_State' ) ) { + require_once __DIR__ . '/class-wp-html-processor-scan-state.php'; +} + if ( ! class_exists( 'WP_HTML_Processor' ) ) { require_once __DIR__ . '/class-wp-html-processor.php'; } From 4cf7b9c8d6deb02355e47e092f90bfb05e275653 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 18:00:32 +0100 Subject: [PATCH 07/18] Add trailing periods to comments in test file --- phpunit/html/wp-html-processor-test.php | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index 5d3696536fb7a..5582a988bf067 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -95,7 +95,7 @@ public function test_flushes_up_to_close_tag_from_deep_within() { $tags->next_tag( 'section' ); $state = $tags->new_state(); - // Jump inside this tag + // Jump inside this tag. $tags->balanced_next( $state, 'p' ); $this->assertTrue( $tags->get_attribute( 'start' ) ); // Then exit the outer section we were scanning. @@ -139,7 +139,7 @@ public function test_can_navigate_with_unique_state_throughout_structure() { $tags->next_tag( 'section' ); $state = $tags->new_state(); - // Jump inside this tag + // Jump inside this tag. $tags->balanced_next( $state, 'p' ); $this->assertTrue( $tags->get_attribute( 'start' ) ); @@ -158,7 +158,7 @@ public function test_can_navigate_with_unique_state_throughout_structure() { $tags->next_tag(); $this->assertTrue( $tags->get_attribute( 'inner' ) ); - // And now flush out the previous stack/frame + // And now flush out the previous stack/frame. 
while ( $tags->balanced_next( $state ) ) { continue; } From a1d4fb11c6b7fcc8feb47ec5265fca70ee3de473 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 18:00:59 +0100 Subject: [PATCH 08/18] Add class-level PHPDoc to WP_HTML_Processor_Scan_State --- .../html/class-wp-html-processor-scan-state.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/experimental/html/class-wp-html-processor-scan-state.php b/lib/experimental/html/class-wp-html-processor-scan-state.php index c4c8cc1c2f7b4..87fbdb3329220 100644 --- a/lib/experimental/html/class-wp-html-processor-scan-state.php +++ b/lib/experimental/html/class-wp-html-processor-scan-state.php @@ -7,6 +7,16 @@ * @since 6.2.0 */ +/** + * Track opening tags and scanning depth. + * + * This class is for internal usage of the WP_HTML_Processor class. + * + * @access private + * @since 6.2.0 + * + * @see WP_HTML_Processor + */ class WP_HTML_Processor_Scan_State { public $budget = 1000; public $open_tags = array(); From 451c20b4eee975ea96647df08bd003e5d1d6f272 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 18:11:28 +0100 Subject: [PATCH 09/18] PHPDoc for is_html_void_element --- lib/experimental/html/class-wp-html-processor.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index ef1c91dc44904..deba382517552 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -179,6 +179,11 @@ public static function classify_tag_type( $is_closer, $is_void ) { } /** + * Whether a given HTML element is void (e.g.
). + * + * @param string $tag_name The element in question. + * @return bool True if the element is void. + * * @see https://html.spec.whatwg.org/#elements-2 */ public static function is_html_void_element( $tag_name ) { From 39a8d40dd09ba2420731cf8458b57a6d458771ec Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Mon, 30 Jan 2023 18:12:56 +0100 Subject: [PATCH 10/18] Add PHPDoc for classify_tag_type --- lib/experimental/html/class-wp-html-processor.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index deba382517552..2b6fca9f70ea4 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -170,6 +170,16 @@ private function content_inside_bookmarks( $start_bookmark, $end_bookmark ) { * HTML-related Utility Functions */ + /** + * Classify a given HTML tag type. + * + * Return 'opener' for an opening element, 'closer' for a closing element, + * and 'void' for a void element. + * + * @param bool $is_closer Whether the current element is a closing element. + * @param bool $is_void Whether the current element is a void element. + * @return 'opener'|'closer'|'void' The type of element in question. 
+ */ public static function classify_tag_type( $is_closer, $is_void ) { if ( $is_void ) { return 'void'; From 7c7e09908c8b0fb1c1ab762e68241c945c889cb7 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Tue, 31 Jan 2023 10:28:04 +0100 Subject: [PATCH 11/18] Add PHPDoc for balanced_next --- lib/experimental/html/class-wp-html-processor.php | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 2b6fca9f70ea4..5b3b30d0b2754 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -31,6 +31,20 @@ public function new_state() { return $state; } + /** + * Find the matching closing tag for an opening tag. + * + * When called while on an open tag, move to the matching closing tag, + * respecting any in-between content, including nested tags of the same + * name. Return false when called on a closing or void tag, or if no + * matching closing tag was found. + * + * @param WP_HTML_Processor_Scan_State $state Tracking state. + * @param array|string $query Query criteria for the closing tag. + * @return bool True if a matching closing tag was found. 
+ * + * @see WP_HTML_Tag_Processor::parse_query + */ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = null ) { while ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) && $state->budget-- > 0 ) { $tag_name = $this->get_tag(); From 57a896b96112df1b6b18f849b042e0d76509ea72 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Tue, 31 Jan 2023 10:28:26 +0100 Subject: [PATCH 12/18] Fix Yoda condition --- lib/experimental/html/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 5b3b30d0b2754..5308ce2bf87c9 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -112,7 +112,7 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul * when existing this function, but by tracking it like this we don't * have to remember to do that. */ - $depth = $type === 'void' + $depth = 'void' === $type ? $state->relative_depth() + 1 : $state->relative_depth(); From 0cc536c74add0972d87c03078dcd3d161d24a3d4 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Tue, 31 Jan 2023 10:37:39 +0100 Subject: [PATCH 13/18] More PHPDoc --- .../html/class-wp-html-processor.php | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 5308ce2bf87c9..b7027ed24db18 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -15,11 +15,19 @@ * input. E.g. it's clear to me how to handle `
` but not clear to how * handle `

` given that `` is a formatting element but `

` is * not, that `

` itself is a special element. + * + * @package WordPress + * @subpackage HTML + * @since 6.2.0 */ - class WP_HTML_Processor extends WP_HTML_Tag_Processor { + /** + * Create a new tracking state for, based on the current opening tag. + * + * @return WP_HTML_Processor_Scan_State + */ public function new_state() { $state = new WP_HTML_Processor_Scan_State(); $tag_name = $this->get_tag(); @@ -138,6 +146,14 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul return false; } + /** + * Return the content between two balanced tags. + * + * When called on an opening tag, return the HTML content found between + * that opening tag and its matching closing tag. + * + * @return string The content between the current opening and its matching closing tag. + */ public function get_content_inside_balanced_tags() { static $start_name = null; static $end_name = null; @@ -169,6 +185,14 @@ public function get_content_inside_balanced_tags() { return $content; } + /** + * Return the content between two bookmarks. + * + * @param string $start_bookmark Name of the bookmark marking the start of the content. + * @param string $end_bookmark Name of the bookmark marking the end of the content. + * @return string|null The content between the two bookmarks. + * Null if either of the bookmarks isn't set. + */ private function content_inside_bookmarks( $start_bookmark, $end_bookmark ) { if ( ! 
isset( $this->bookmarks[ $start_bookmark ], $this->bookmarks[ $end_bookmark ] ) ) { return null; From d350d2606db0952652dc2a6925aeca260d21a958 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Tue, 31 Jan 2023 11:13:26 +0100 Subject: [PATCH 14/18] Add PHPDoc for WP_HTML_Processor class --- .../html/class-wp-html-processor.php | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index b7027ed24db18..26e7d1d143c47 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -21,7 +21,36 @@ * @since 6.2.0 */ - +/** + * Processes an input HTML document by applying a specified set of patches + * to that input. Retrieves content between matching opening and closing tags. + * Tokenizes HTML but does not fully parse the input document. + * + * ## Usage + * + * Note that this is a subclass of `WP_HTML_Tag_Processor`. Most of the + * functionality of this class is thus covered by `WP_HTML_Tag_Processor`'s + * documentation. + * The following documentation covers the additional features added by + * `WP_HTML_Processor`. + * + * ### Retrieving content + * + * When on an opening tag, it's possible to retrieve the content enclosed between + * that opening tag and its matching closing tag. + * + * Example: + * ```php + * $html = '

Inner div content
'; + * $tags = new WP_HTML_Processor( $html ); + * $tags->next_tag( [ 'tag_name' => 'div' ] ); + * $label = $tags->get_content_inside_balanced_tags(); + * // $label === '
Inner div content
' + * } + * ``` + * + * @see WP_HTML_Tag_Processor + */ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** * Create a new tracking state for, based on the current opening tag. From 4200ddd97e42b278f02a83409482e955f3b2d8da Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Tue, 31 Jan 2023 11:16:51 +0100 Subject: [PATCH 15/18] Add short description for class-wp-html-processor.php --- lib/experimental/html/class-wp-html-processor.php | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 26e7d1d143c47..5d571a840aa69 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -1,6 +1,11 @@ Date: Tue, 31 Jan 2023 11:34:56 +0100 Subject: [PATCH 16/18] Add PHPDoc for WP_HTML_Processor_Scan_State methods --- .../class-wp-html-processor-scan-state.php | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/lib/experimental/html/class-wp-html-processor-scan-state.php b/lib/experimental/html/class-wp-html-processor-scan-state.php index 87fbdb3329220..c826bb68cbc29 100644 --- a/lib/experimental/html/class-wp-html-processor-scan-state.php +++ b/lib/experimental/html/class-wp-html-processor-scan-state.php @@ -18,10 +18,32 @@ * @see WP_HTML_Processor */ class WP_HTML_Processor_Scan_State { - public $budget = 1000; - public $open_tags = array(); + /** + * The maximum number of tags we'll traverse in search of a matching closing tag. + * + * @var integer + */ + public $budget = 1000; + + /** + * A stack of the opening tags that we have visited. + * + * @var string[] + */ + public $open_tags = array(); + + /** + * The maximum depth of nested tags we're willing to traverse. + * + * @var int + */ public $match_depth = null; + /** + * The depth of nested opening tags, counted from where we started. + * + * @return int The depth of nested tags. 
+ */ public function relative_depth() { return count( $this->open_tags ); } From 45de52270e007f3a9266924f985f3aec0ea9f9b5 Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Tue, 31 Jan 2023 11:36:35 +0100 Subject: [PATCH 17/18] Rephrase a bit --- lib/experimental/html/class-wp-html-processor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 5d571a840aa69..482065d813623 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -76,10 +76,10 @@ public function new_state() { /** * Find the matching closing tag for an opening tag. * - * When called while on an open tag, move to the matching closing tag, - * respecting any in-between content, including nested tags of the same - * name. Return false when called on a closing or void tag, or if no - * matching closing tag was found. + * When called while on an open tag, traverse the HTML until we find + * the matching closing tag, respecting any in-between content, including + * nested tags of the same name. Return false when called on a closing or + * void tag, or if no matching closing tag was found. * * @param WP_HTML_Processor_Scan_State $state Tracking state. * @param array|string $query Query criteria for the closing tag. 
From 35bd051be9c4d7b94f4df3e9c9cbc2e5b530c13c Mon Sep 17 00:00:00 2001 From: Bernie Reiter Date: Tue, 31 Jan 2023 11:51:12 +0100 Subject: [PATCH 18/18] Fix a member var PHPDoc --- lib/experimental/html/class-wp-html-processor-scan-state.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/experimental/html/class-wp-html-processor-scan-state.php b/lib/experimental/html/class-wp-html-processor-scan-state.php index c826bb68cbc29..5f02ec86245c5 100644 --- a/lib/experimental/html/class-wp-html-processor-scan-state.php +++ b/lib/experimental/html/class-wp-html-processor-scan-state.php @@ -33,7 +33,7 @@ class WP_HTML_Processor_Scan_State { public $open_tags = array(); /** - * The maximum depth of nested tags we're willing to traverse. + * The depth of nested tags at which we expect to find the matching closing tag. * * @var int */