diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index 7c03c484ddb70..f3ee41537cd9e 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -604,16 +604,178 @@ function wpautop( $text, $br = true ) {
 	return $text;
 }
 
+/**
+ * Returns a Tag Processor exposing the raw matched tokens.
+ *
+ * The returned object is an anonymous subclass of the Tag Processor
+ * adding a `get_raw_token()` method for reading the matched token's bytes.
+ *
+ * @since 6.6.0
+ * @access private
+ *
+ * @param string $html Passed into the Tag Processor.
+ * @return WP_HTML_Tag_Processor Tag Processor extended with `get_raw_token()`.
+ */
+function wp_get_internal_tag_processor( $html ) {
+	return new class( $html ) extends WP_HTML_Tag_Processor {
+		/**
+		 * Returns the raw token from the input string at the
+		 * current location, if paused at a location.
+		 *
+		 * @return false|string Raw bytes of the matched token, or `false` if no
+		 *                      token is matched or its span cannot be bookmarked.
+		 */
+		public function get_raw_token() {
+			if (
+				WP_HTML_Tag_Processor::STATE_READY === $this->parser_state ||
+				WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
+				WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
+			) {
+				return false;
+			}
+
+			// The token span is read from a bookmark; bail if one can't be set.
+			$bookmark = '__raw_token';
+			if ( ! $this->set_bookmark( $bookmark ) ) {
+				return false;
+			}
+
+			$span      = $this->bookmarks[ $bookmark ];
+			$raw_token = substr( $this->html, $span->start, $span->length );
+
+			// Drop the temporary bookmark so it doesn't linger on the processor.
+			$this->release_bookmark( $bookmark );
+
+			return $raw_token;
+		}
+	};
+}
+
 /**
  * Separates HTML elements and comments from the text.
  *
+ * This function tokenizes an HTML document into its
+ * components and returns the array of tokens.
+ *
  * @since 4.2.4
+ * @since 6.6.0 Relies on the HTML API for parsing.
  *
- * @param string $input The text which has to be formatted.
- * @return string[] Array of the formatted text.
+ * @param string $input_html Raw HTML potentially containing a mixture of tags,
+ *                           comments, text nodes, and other syntax.
+ * @return string[] Tokens in document order: text at even indices, every other
+ *                  kind of token at odd indices, always ending with a text entry.
  */
-function wp_html_split( $input ) {
-	return preg_split( get_html_split_regex(), $input, -1, PREG_SPLIT_DELIM_CAPTURE );
+function wp_html_split( $input_html ) {
+	$chunks    = array();
+	$consumed  = 0;
+	$processor = wp_get_internal_tag_processor( $input_html );
+
+	while ( $processor->next_token() ) {
+		$raw_html  = (string) $processor->get_raw_token();
+		$consumed += strlen( $raw_html );
+
+		/*
+		 * There's a legacy behavior where text nodes are always stored in even
+		 * indices and "elements" are stored in odd indices. To preserve this,
+		 * empty text nodes are inserted when there's none between other syntax
+		 * tokens.
+		 */
+		if ( 0 === count( $chunks ) % 2 && '#text' !== $processor->get_token_name() ) {
+			$chunks[] = '';
+		}
+
+		/*
+		 * Only openers are matched together with their contents and closer.
+		 * A stray closing tag such as `</script>` reports the same tag name
+		 * but is a plain token and must not be split any further.
+		 */
+		$is_special_atomic_element = ! $processor->is_tag_closer() && in_array(
+			$processor->get_tag(),
+			array( 'SCRIPT', 'STYLE', 'XMP', 'NOEMBED', 'NOFRAMES', 'TITLE', 'TEXTAREA' ),
+			true
+		);
+
+		if ( ! $is_special_atomic_element ) {
+			$chunks[] = $raw_html;
+			continue;
+		}
+
+		/*
+		 * For special atomic tags, it's necessary to redo some work to find
+		 * the opening and closing tag, because the Tag Processor consumes
+		 * them all in one go.
+		 *
+		 * By replacing the first character of the tag name, it's possible to
+		 * trick the Tag Processor into thinking it's non-special content, and
+		 * then get the starting and ending tags, then restore the tag name at
+		 * the end.
+		 *
+		 * The replacement is "Z" because none of the special tag names start
+		 * with it; "X" would turn `<xmp>` into `<Xmp>`, which is still XMP.
+		 *
+		 * Because the end tag for these special atomic elements are matched
+		 * if they are unexpected, then the final closing tag will be found
+		 * after renaming the opening.
+		 */
+		$first_char  = $raw_html[1];
+		$raw_html[1] = 'Z';
+		$special     = wp_get_internal_tag_processor( $raw_html );
+
+		// The first tag is the modified tag; without it there's nothing to split.
+		if ( ! $special->next_tag() ) {
+			$raw_html[1] = $first_char;
+			$chunks[]    = $raw_html;
+			continue;
+		}
+
+		$opening_tag    = (string) $special->get_raw_token();
+		$opening_tag[1] = $first_char;
+		$chunks[]       = $opening_tag;
+
+		$special->set_bookmark( 'last' );
+		while ( $special->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
+			$special->set_bookmark( 'last' );
+		}
+		$closing_tag = $special->seek( 'last' ) ? (string) $special->get_raw_token() : '';
+
+		/*
+		 * The token ends with its closing tag, but the contents may keep the
+		 * scan from reaching it (for example, syntax left unclosed inside of
+		 * it). When the last tag found isn't at the end of the token, fall
+		 * back to the last "</" following the opening tag.
+		 */
+		$ends_with_closing_tag = (
+			'' !== $closing_tag &&
+			strlen( $opening_tag ) + strlen( $closing_tag ) <= strlen( $raw_html ) &&
+			0 === substr_compare( $raw_html, $closing_tag, -strlen( $closing_tag ) )
+		);
+
+		if ( ! $ends_with_closing_tag ) {
+			$closer_at   = strrpos( $raw_html, '</' );
+			$closing_tag = false !== $closer_at && $closer_at >= strlen( $opening_tag )
+				? substr( $raw_html, $closer_at )
+				: '';
+		}
+
+		// A computed length is used because a negative length of zero would yield nothing.
+		$inner_length = strlen( $raw_html ) - strlen( $opening_tag ) - strlen( $closing_tag );
+		$chunks[]     = (string) substr( $raw_html, strlen( $opening_tag ), $inner_length );
+		$chunks[]     = $closing_tag;
+	}
+
+	/*
+	 * Parsing stops at incomplete syntax, e.g. a trailing "<div". Whatever was
+	 * not tokenized is kept as text so that joining the chunks reproduces the
+	 * input, and the list always ends with a text entry as it did previously.
+	 */
+	$leftover = (string) substr( $input_html, $consumed );
+	if ( 1 === count( $chunks ) % 2 ) {
+		$chunks[ count( $chunks ) - 1 ] .= $leftover;
+	} else {
+		$chunks[] = $leftover;
+	}
+
+	return $chunks;
 }
 
 /**
diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
index 4597a888b5efe..26d22c072e48e 100644
--- a/src/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -926,8 +926,8 @@ private function base_class_next_token() {
 			return false;
 		}
 		$this->parser_state         = self::STATE_MATCHED_TAG;
-		$this->token_length         = $tag_ends_at - $this->token_starts_at;
 		$this->bytes_already_parsed = $tag_ends_at + 1;
+		$this->token_length         = $this->bytes_already_parsed - $this->token_starts_at;
 
 		/*
 		 * For non-DATA sections which might contain text that looks like HTML tags but
@@ -1013,7 +1013,7 @@ private function base_class_next_token() {
 			 */
 			$this->token_starts_at    = $was_at;
 			$this->token_length       = $this->bytes_already_parsed - $this->token_starts_at;
-			$this->text_starts_at     = $tag_ends_at + 1;
+			$this->text_starts_at     = $tag_ends_at;
 			$this->text_length        = $this->tag_name_starts_at - $this->text_starts_at;
 			$this->tag_name_starts_at = $tag_name_starts_at;
 			$this->tag_name_length    = $tag_name_length;
@@ -2687,7 +2687,7 @@ public function has_self_closing_flag() {
 		 *     <figure />
 		 *             ^ this appears one character before the end of the closing ">".
 		 */
-		return '/' === $this->html[ $this->token_starts_at + $this->token_length - 1 ];
+		return '/' === $this->html[ $this->token_starts_at + $this->token_length - 2 ];
 	}
 
 	/**
diff --git a/src/wp-includes/interactivity-api/class-wp-interactivity-api-directives-processor.php b/src/wp-includes/interactivity-api/class-wp-interactivity-api-directives-processor.php
index 3b2dcb1237971..b12dcb4b3b158 100644
--- a/src/wp-includes/interactivity-api/class-wp-interactivity-api-directives-processor.php
+++ b/src/wp-includes/interactivity-api/class-wp-interactivity-api-directives-processor.php
@@ -107,7 +107,7 @@ public function append_content_after_template_tag_closer( string $new_content ):
 		$bookmark = 'append_content_after_template_tag_closer';
 		$this->set_bookmark( $bookmark );
 
-		$after_closing_tag = $this->bookmarks[ $bookmark ]->start + $this->bookmarks[ $bookmark ]->length + 1;
+		$after_closing_tag = $this->bookmarks[ $bookmark ]->start + $this->bookmarks[ $bookmark ]->length;
 		$this->release_bookmark( $bookmark );
 
 		// Appends the new content.
@@ -140,7 +140,7 @@ private function get_after_opener_tag_and_before_closer_tag_positions( bool $rew
 		}
 		list( $opener_tag, $closer_tag ) = $bookmarks;
 
-		$after_opener_tag  = $this->bookmarks[ $opener_tag ]->start + $this->bookmarks[ $opener_tag ]->length + 1;
+		$after_opener_tag  = $this->bookmarks[ $opener_tag ]->start + $this->bookmarks[ $opener_tag ]->length;
 		$before_closer_tag = $this->bookmarks[ $closer_tag ]->start;
 
 		if ( $rewind ) {
diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
index 824630b33516a..ddebb7d98b4fb 100644
--- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
+++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
@@ -2746,7 +2746,7 @@ public function test_applies_updates_before_proceeding() {
 			public function insert_after( $new_html ) {
 				$this->set_bookmark( 'here' );
 				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
-					$this->bookmarks['here']->start + $this->bookmarks['here']->length + 1,
+					$this->bookmarks['here']->start + $this->bookmarks['here']->length,
 					0,
 					$new_html
 				);