diff --git a/lib/experimental/html/class-wp-html-processor-scan-state.php b/lib/experimental/html/class-wp-html-processor-scan-state.php
new file mode 100644
index 00000000000000..5f02ec86245c54
--- /dev/null
+++ b/lib/experimental/html/class-wp-html-processor-scan-state.php
@@ -0,0 +1,50 @@
+open_tags );
+ }
+}
diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php
new file mode 100644
index 00000000000000..482065d813623c
--- /dev/null
+++ b/lib/experimental/html/class-wp-html-processor.php
@@ -0,0 +1,292 @@
+` but not clear to how
+ * handle `
` given that `` is a formatting element but `` is
+ * not, that `
` itself is a special element.
+ *
+ * @package WordPress
+ * @subpackage HTML
+ * @since 6.2.0
+ */
+
+/**
+ * Processes an input HTML document by applying a specified set of patches
+ * to that input. Retrieves content between matching opening and closing tags.
+ * Tokenizes HTML but does not fully parse the input document.
+ *
+ * ## Usage
+ *
+ * Note that this is a subclass of `WP_HTML_Tag_Processor`. Most of the
+ * functionality of this class is thus covered by `WP_HTML_Tag_Processor`'s
+ * documentation.
+ * The following documentation covers the additional features added by
+ * `WP_HTML_Processor`.
+ *
+ * ### Retrieving content
+ *
+ * When on an opening tag, it's possible to retrieve the content enclosed between
+ * that opening tag and its matching closing tag.
+ *
+ * Example:
+ * ```php
+ * $html = '
Inner div content
![]()
';
+ * $tags = new WP_HTML_Processor( $html );
+ * $tags->next_tag( [ 'tag_name' => 'div' ] );
+ * $label = $tags->get_content_inside_balanced_tags();
+ * // $label === 'Inner div content
'
+ * ```
+ *
+ * @see WP_HTML_Tag_Processor
+ */
+class WP_HTML_Processor extends WP_HTML_Tag_Processor {
+ /**
+ * Build a fresh scan state seeded from the tag the processor is currently on.
+ *
+ * The current tag's name becomes an entry in the state's stack of open tags,
+ * unless the current tag is a void element or a tag closer, in which case
+ * the state is returned exactly as its constructor produced it.
+ *
+ * @return WP_HTML_Processor_Scan_State The newly created tracking state.
+ */
+ public function new_state() {
+     $scan_state   = new WP_HTML_Processor_Scan_State();
+     $current_name = $this->get_tag();
+
+     // Void elements and closers do not open a nesting level, so nothing is pushed.
+     if ( self::is_html_void_element( $current_name ) || $this->is_tag_closer() ) {
+         return $scan_state;
+     }
+
+     $scan_state->open_tags[] = $current_name;
+
+     return $scan_state;
+ }
+
+ /**
+ * Advance to the next tag that matches the query while keeping track of
+ * tag nesting in the given scan state.
+ *
+ * Tags (openers and closers alike) are visited one at a time and the stack
+ * of open tags in `$state` is updated for each of them. The scan stops as
+ * soon as one of the following happens:
+ *
+ * - A visited tag matches `$query` (and `$state->match_depth`, when set):
+ *   returns true.
+ * - A closing tag brings the state's relative depth back to zero, i.e. the
+ *   element on which the scan started has been closed: returns false.
+ * - A void tag is visited while the state's relative depth is zero:
+ *   returns false.
+ * - A closing tag doesn't match the most recently opened tag (unbalanced
+ *   markup): returns false.
+ * - There are no more tags, or `$state->budget` is used up: returns false.
+ *
+ * @param WP_HTML_Processor_Scan_State $state Tracking state; its open tags and budget are modified in place.
+ * @param array|string|null $query Optional. Query criteria for the tag to find. Default null.
+ * @return bool True if a matching tag was found, false otherwise.
+ *
+ * @see WP_HTML_Tag_Processor::parse_query
+ */
+ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = null ) {
+ // Note that the processor advances to the next tag before the budget is checked.
+ while ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) && $state->budget-- > 0 ) {
+ $tag_name = $this->get_tag();
+ $is_closer = $this->is_tag_closer();
+ $is_void = self::is_html_void_element( $tag_name );
+ // A void tag name is classified as 'void' regardless of whether it is a closer.
+ $type = self::classify_tag_type( $is_closer, $is_void );
+
+ /*
+ * Step 1. Update the stack of open tags.
+ *
+ * If and when we add more complete HTML parsing support we will also
+ * need to track the stack of active formats so that we can properly
+ * handle missing tags and overlapping tags.
+ */
+
+ switch ( $type ) {
+ case 'void':
+ /*
+ * Void tags (such as <br>) can't have children and so we
+ * won't push or pop them from the stack of open tags.
+ *
+ * If and when we support self-closing foreign tags we would
+ * need to separately track those, but their behavior matches
+ * this case. The self-closing flag is ignored for HTML5 tags.
+ *
+ * A void tag seen at a relative depth of zero is not inside
+ * any tracked element, so the scan stops here.
+ */
+ if ( 0 === $state->relative_depth() ) {
+ return false;
+ }
+
+ break;
+
+ case 'opener':
+ $state->open_tags[] = $tag_name;
+ break;
+
+ case 'closer':
+ $last_tag = array_pop( $state->open_tags );
+
+ /*
+ * Currently we can only support fully-normative and balanced HTML5.
+ * If we encounter anything we don't expect then we will bail. In a
+ * future update we may perform more careful HTML parsing and unlock
+ * navigating through non-normative documents.
+ *
+ * This also covers a closer met with an empty stack: `array_pop()`
+ * then yields null, which never equals the tag name.
+ */
+ if ( $last_tag !== $tag_name ) {
+ return false;
+ }
+
+ /*
+ * Step 2. Bail if we've reached the end of the tag in which we started.
+ */
+ if ( 0 === $state->relative_depth() ) {
+ return false;
+ }
+
+ break;
+ }
+
+ /*
+ * Void elements don't enter the stack, but they do exist in the
+ * depth hierarchy, so we have to temporarily account for that.
+ *
+ * We could have followed the approach in the HTML5 spec by appending
+ * the void tag to the stack of open tags, and then remember to pop it
+ * when exiting this function, but by tracking it like this we don't
+ * have to remember to do that.
+ */
+ $depth = 'void' === $type
+ ? $state->relative_depth() + 1
+ : $state->relative_depth();
+
+ /*
+ * Step 3. Determine if we have a matching tag. In addition to the query
+ * we pass along to the underlying tag processor we're going to allow
+ * specifying the relative depth for a match. For example, a CSS child
+ * combinator would specify that a match must have a relative depth of 1,
+ * indicating that it's a direct child of the surrounding element, whereas
+ * the descendant selector could match at any depth and so leaves
+ * `match_depth` unset. To prevent matching _above_ a tag we rely on
+ * Step 2 to stop searching once we've exited the tag on which we started.
+ *
+ * The comparison uses `match_depth + 1` because `$depth` counts the
+ * tags on the stack, including any tag the state was seeded with.
+ */
+
+ if ( ! isset( $state->match_depth ) || $state->match_depth + 1 === $depth ) {
+ $this->parse_query( $query );
+ if ( $this->matches() ) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Return the content between two balanced tags.
+ *
+ * When called on an opening tag, returns the HTML found between that
+ * opening tag and its matching closing tag, then seeks the processor back
+ * to the tag on which it was called.
+ *
+ * @return string|null The content between the current opening tag and its matching
+ *                     closing tag. Null if either internal bookmark isn't set.
+ */
+ public function get_content_inside_balanced_tags() {
+     /*
+      * The bookmark names are kept between calls and only regenerated when
+      * a bookmark of the same name already exists on this processor.
+      */
+     static $start_name = null;
+     static $end_name   = null;
+
+     if ( null === $start_name || array_key_exists( $start_name, $this->bookmarks ) ) {
+         $rand_id    = rand( 1, PHP_INT_MAX );
+         $start_name = "start_{$rand_id}";
+     }
+
+     /*
+      * The end bookmark gets its own prefix so that it can never share a
+      * name with, and thereby overwrite, the start bookmark.
+      */
+     if ( null === $end_name || array_key_exists( $end_name, $this->bookmarks ) ) {
+         $rand_id  = rand( 1, PHP_INT_MAX );
+         $end_name = "end_{$rand_id}";
+     }
+
+     $this->set_bookmark( $start_name );
+
+     // Walk forward until the balanced scan reports that it is finished.
+     $state = $this->new_state();
+     while ( $this->balanced_next( $state ) ) {
+         continue;
+     }
+
+     $this->set_bookmark( $end_name );
+     $content = $this->content_inside_bookmarks( $start_name, $end_name );
+
+     // Restore the original position before dropping the temporary bookmarks.
+     $this->seek( $start_name );
+
+     $this->release_bookmark( $start_name );
+     $this->release_bookmark( $end_name );
+
+     return $content;
+ }
+
+ /**
+ * Return the content between two bookmarks.
+ *
+ * The slice is taken from the updated HTML. It begins one byte after the
+ * start bookmark's `end` offset and its length is the end bookmark's
+ * `start` offset minus the start bookmark's `end` offset, minus two.
+ *
+ * NOTE(review): the offsets presumably account for the delimiters of the
+ * surrounding tags; confirm against how `set_bookmark()` records spans.
+ *
+ * @param string $start_bookmark Name of the bookmark marking the start of the content.
+ * @param string $end_bookmark   Name of the bookmark marking the end of the content.
+ * @return string|null The content between the two bookmarks.
+ *                     Null if either of the bookmarks isn't set.
+ */
+ private function content_inside_bookmarks( $start_bookmark, $end_bookmark ) {
+     if ( ! isset( $this->bookmarks[ $start_bookmark ] ) || ! isset( $this->bookmarks[ $end_bookmark ] ) ) {
+         return null;
+     }
+
+     $opening_span = $this->bookmarks[ $start_bookmark ];
+     $closing_span = $this->bookmarks[ $end_bookmark ];
+
+     // Fetch the HTML before reading any offsets, keeping the evaluation order fixed.
+     $html   = $this->get_updated_html();
+     $offset = $opening_span->end + 1;
+     $length = $closing_span->start - $opening_span->end - 2;
+
+     return substr( $html, $offset, $length );
+ }
+
+ /*
+ * HTML-related Utility Functions
+ */
+
+ /**
+ * Classify a tag as an opener, a closer, or a void element.
+ *
+ * The void check takes precedence: whenever `$is_void` is truthy the result
+ * is 'void', whatever the value of `$is_closer`.
+ *
+ * @param bool $is_closer Whether the current element is a closing element.
+ * @param bool $is_void   Whether the current element is a void element.
+ * @return 'opener'|'closer'|'void' The type of element in question.
+ */
+ public static function classify_tag_type( $is_closer, $is_void ) {
+     if ( $is_void ) {
+         return 'void';
+     }
+
+     if ( $is_closer ) {
+         return 'closer';
+     }
+
+     return 'opener';
+ }
+
+ /**
+ * Whether a given HTML element is void (e.g. <br>).
+ *
+ * Only the upper-case tag names listed in the body are recognized; any
+ * other value yields false.
+ *
+ * @param string $tag_name The element in question.
+ * @return bool True if the element is void.
+ *
+ * @see https://html.spec.whatwg.org/#elements-2
+ */
+ public static function is_html_void_element( $tag_name ) {
+     $void_elements = array(
+         'AREA',
+         'BASE',
+         'BR',
+         'COL',
+         'EMBED',
+         'HR',
+         'IMG',
+         'INPUT',
+         'LINK',
+         'META',
+         'SOURCE',
+         'TRACK',
+         'WBR',
+     );
+
+     // Non-strict membership test: names are compared with `==`.
+     return in_array( $tag_name, $void_elements );
+ }
+}
diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php
index 433c16a150806c..8bb4e52fde0685 100644
--- a/lib/experimental/html/class-wp-html-tag-processor.php
+++ b/lib/experimental/html/class-wp-html-tag-processor.php
@@ -1900,7 +1900,7 @@ public function get_updated_html() {
* @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. .
* }
*/
- private function parse_query( $query ) {
+ protected function parse_query( $query ) {
if ( null !== $query && $query === $this->last_query ) {
return;
}
@@ -1947,7 +1947,7 @@ private function parse_query( $query ) {
*
* @return boolean
*/
- private function matches() {
+ protected function matches() {
if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) {
return false;
}
diff --git a/lib/experimental/html/wp-html.php b/lib/experimental/html/wp-html.php
index dd3aeb7af45ae9..68e7c57c7aaf3e 100644
--- a/lib/experimental/html/wp-html.php
+++ b/lib/experimental/html/wp-html.php
@@ -21,3 +21,11 @@
if ( ! class_exists( 'WP_HTML_Tag_Processor' ) ) {
require_once __DIR__ . '/class-wp-html-tag-processor.php';
}
+
+if ( ! class_exists( 'WP_HTML_Processor_Scan_State' ) ) {
+ require_once __DIR__ . '/class-wp-html-processor-scan-state.php';
+}
+
+if ( ! class_exists( 'WP_HTML_Processor' ) ) {
+ require_once __DIR__ . '/class-wp-html-processor.php';
+}
diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php
new file mode 100644
index 00000000000000..5582a988bf067e
--- /dev/null
+++ b/phpunit/html/wp-html-processor-test.php
@@ -0,0 +1,229 @@
+outside![]()
inside
' );
+
+ $tags->next_tag( 'div' );
+ $state = $tags->new_state();
+ $this->assertFalse( $tags->balanced_next( $state, 'img' ) );
+
+ $this->assertTrue( $tags->next_tag( 'div' ) );
+ $state = $tags->new_state();
+ $this->assertTrue( $tags->balanced_next( $state, 'img' ) );
+ }
+
+ /**
+ * Checks that `balanced_next()` reports no `img` match when the scan state
+ * restricts matches to `match_depth = 1`.
+ *
+ * NOTE(review): the HTML fixture passed to the constructor reads as an
+ * empty string here; confirm the intended markup.
+ */
+ public function test_find_immediate_child_tag() {
+ $tags = new WP_HTML_Processor( '' );
+
+ $tags->next_tag( 'div' );
+ $state = $tags->new_state();
+ // Only tags at a relative depth of 1 may match.
+ $state->match_depth = 1;
+ $this->assertFalse( $tags->balanced_next( $state, 'img' ) );
+ }
+
+ /**
+ * Checks that `balanced_next()` finds an `img` when matches are restricted
+ * to `match_depth = 1`, and that `get_attribute( 'wanted' )` reads back as
+ * `true` on the tag it stopped on.
+ *
+ * NOTE(review): the HTML fixture passed to the constructor reads as an
+ * empty string here; confirm the intended markup.
+ */
+ public function test_find_immediate_child_tag2() {
+ $tags = new WP_HTML_Processor( '' );
+
+ $tags->next_tag( 'div' );
+ $state = $tags->new_state();
+ // Only tags at a relative depth of 1 may match.
+ $state->match_depth = 1;
+ $this->assertTrue( $tags->balanced_next( $state, 'img' ), 'Did not find the wanted
' );
+ $this->assertTrue( $tags->get_attribute( 'wanted' ), 'Found the wrong
' );
+ }
+
+ /**
+ * Checks that `balanced_next()` finds an `img` when matches are restricted
+ * to `match_depth = 3`.
+ *
+ * NOTE(review): the HTML fixture passed to the constructor reads as an
+ * empty string here; confirm the intended markup.
+ */
+ public function test_find_child_tag() {
+ $tags = new WP_HTML_Processor( '' );
+
+ $tags->next_tag( 'div' );
+ $state = $tags->new_state();
+ // Only tags at a relative depth of 3 may match.
+ $state->match_depth = 3;
+ $this->assertTrue( $tags->balanced_next( $state, 'img' ) );
+ }
+
+ public function test_flushes_up_to_close_tag_from_deep_within() {
+ $tags = new WP_HTML_Processor(
+ <<
+
+ Cows
+
+
Cows are clever.
+
Cows eat grass.
+
+ Things cows can't do
+
+ Pilot aeroplanes
+ Drive race cars
+ Captain ships
+
+ This concludes our discussion of cows.
+
+
+
+HTML
+ );
+
+ $tags->next_tag( 'section' );
+ $state = $tags->new_state();
+
+ // Jump inside this tag.
+ $tags->balanced_next( $state, 'p' );
+ $this->assertTrue( $tags->get_attribute( 'start' ) );
+ // Then exit the outer section we were scanning.
+ while ( $tags->balanced_next( $state ) ) {
+ continue;
+ }
+
+ $this->assertEquals( 'SECTION', $tags->get_tag() );
+ $tags->next_tag( 'p' );
+ $this->assertTrue( $tags->get_attribute( 'wanted' ) );
+ }
+
+ public function test_can_navigate_with_unique_state_throughout_structure() {
+ $tags = new WP_HTML_Processor(
+ <<
+
+ Cows
+
+
Cows are clever.
+
Cows eat grass.
+
+ Things cows can't do
+
+ Pilot aeroplanes
+ Drive race cars
+ Captain ships
+
+ This concludes our discussion of cows.
+
+
+
+HTML
+ );
+
+ $tags->next_tag( 'section' );
+ $state = $tags->new_state();
+
+ // Jump inside this tag.
+ $tags->balanced_next( $state, 'p' );
+ $this->assertTrue( $tags->get_attribute( 'start' ) );
+
+ // Establish a new state/frame for navigating inside the outer structure.
+ $tags->balanced_next( $state, 'ul' );
+ $li_count = 0;
+ $li_state = $tags->new_state();
+ while ( $tags->balanced_next( $li_state, 'li' ) ) {
+ $li_count++;
+ }
+ $this->assertEquals( 3, $li_count );
+
+ // Ensure that we ended up where we expected.
+ $this->assertEquals( 'UL', $tags->get_tag() );
+ $this->assertTrue( $tags->is_tag_closer() );
+ $tags->next_tag();
+ $this->assertTrue( $tags->get_attribute( 'inner' ) );
+
+ // And now flush out the previous stack/frame.
+ while ( $tags->balanced_next( $state ) ) {
+ continue;
+ }
+
+ // Ensure that we're back where we want to be after exiting two separate frames.
+ $this->assertEquals( 'P', $tags->get_tag() );
+ $this->assertTrue( $tags->is_tag_closer() );
+ $tags->next_tag( 'p' );
+ $this->assertTrue( $tags->get_attribute( 'wanted' ) );
+ }
+
+ public function test_can_scan_through_tags_at_a_given_depth() {
+ $tags = new WP_HTML_Processor(
+ <<
+
+ Cows
+
+
Cows are clever.
+
Cows eat grass.
+
+ Things cows can't do
+
+ Pilot aeroplanes
+ Drive race cars
+ Captain ships
+
+ Things cows can do
+
+ This concludes our discussion of cows.
+
+
+
+HTML
+ );
+
+ $tags->next_tag( 'section' );
+ $state = $tags->new_state();
+ $state->match_depth = 3;
+
+ $p3_count = 0;
+ while ( $tags->balanced_next( $state, 'p' ) ) {
+ $p3_count++;
+ }
+
+ // Did we only visit the tags inside section > * > * > p?
+ $this->assertEquals( 5, $p3_count );
+
+ $state = $tags->new_state();
+ $state->match_depth = 2;
+
+ $p2_count = 0;
+ while ( $tags->balanced_next( $state, 'p' ) ) {
+ $p2_count++;
+ }
+
+ // Did we only visit the tags inside section > * > p?
+ $this->assertEquals( 1, $p2_count );
+ }
+}