From 8a0862c3555cb8b0100d18646f678ea84e3f57ad Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 27 May 2024 10:20:26 -0700 Subject: [PATCH 1/9] HTML API: Rewrite wp_html_split relying on the HTML API. --- src/wp-includes/formatting.php | 57 +++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 7c03c484ddb70..51714a0c94995 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -607,13 +607,62 @@ function wpautop( $text, $br = true ) { /** * Separates HTML elements and comments from the text. * + * This function tokenizes an HTML document into its + * components and returns the array of tokens. + * * @since 4.2.4 + * @since 6.6.0 Relies on the HTML API for parsing. * - * @param string $input The text which has to be formatted. - * @return string[] Array of the formatted text. + * @param string $input_html Raw HTML potentially containing a mixture of tags, + * comments, text nodes, and other sytnax. + * @return string[] */ -function wp_html_split( $input ) { - return preg_split( get_html_split_regex(), $input, -1, PREG_SPLIT_DELIM_CAPTURE ); +function wp_html_split( $input_html ) { + $chunks = array(); + $processor = new class ( $input_html ) extends WP_HTML_Tag_Processor { + /** + * Returns the raw token from the input string at the + * current location, if paused at a location. 
+ * + * @return false|string + */ + public function get_raw_token() { + if ( + WP_HTML_Tag_Processor::STATE_READY === $this->parser_state || + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state || + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state + ) { + return false; + } + + $this->set_bookmark( 'here' ); + $here = $this->bookmarks['here']; + + return substr( $this->html, $here->start, $here->length ); + } + }; + + while ( $processor->next_token() ) { + $is_special_atomic_element = in_array( + $processor->get_tag(), + array( 'SCRIPT', 'STYLE', 'XMP', 'NOEMBED', 'NOFRAMES', 'TITLE', 'TEXTAREA' ), + true + ); + + // @todo Transfer everything properly. + if ( $is_special_atomic_element ) { + $raw_html = $processor->get_raw_token(); + $tag_name = substr( $raw_html, 1, strlen( $processor->get_tag() ) ); + $chunks[] = "<{$tag_name}>"; + $chunks[] = $processor->get_modifiable_text(); + $chunks[] = ""; + continue; + } + + $chunks[] = $processor->get_raw_token(); + } + + return $chunks; } /** From bc1c877c0d30d9b36638a31b3c6c8d1a84cf97ac Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 24 May 2024 18:35:50 +0200 Subject: [PATCH 2/9] Fix off-by-1 (cherry picked from commit b7b287676d4e114f49a3c58eb3edffaefe7a05cf) --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 4597a888b5efe..f2327042bf464 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2687,7 +2687,7 @@ public function has_self_closing_flag() { *
* ^ this appears one character before the end of the closing ">". */ - return '/' === $this->html[ $this->token_starts_at + $this->token_length - 1 ]; + return '/' === $this->html[ $this->token_starts_at + $this->token_length - 2 ]; } /** From 89d28fa672a2ef001959718f062d8ef6fd56bc38 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 24 May 2024 19:17:12 +0200 Subject: [PATCH 3/9] Fix interactivity (cherry picked from commit b3b4562c41146d9d8cdc93f478ed53a06092b0bf) --- .../class-wp-interactivity-api-directives-processor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/interactivity-api/class-wp-interactivity-api-directives-processor.php b/src/wp-includes/interactivity-api/class-wp-interactivity-api-directives-processor.php index 3b2dcb1237971..b12dcb4b3b158 100644 --- a/src/wp-includes/interactivity-api/class-wp-interactivity-api-directives-processor.php +++ b/src/wp-includes/interactivity-api/class-wp-interactivity-api-directives-processor.php @@ -107,7 +107,7 @@ public function append_content_after_template_tag_closer( string $new_content ): $bookmark = 'append_content_after_template_tag_closer'; $this->set_bookmark( $bookmark ); - $after_closing_tag = $this->bookmarks[ $bookmark ]->start + $this->bookmarks[ $bookmark ]->length + 1; + $after_closing_tag = $this->bookmarks[ $bookmark ]->start + $this->bookmarks[ $bookmark ]->length; $this->release_bookmark( $bookmark ); // Appends the new content. 
@@ -140,7 +140,7 @@ private function get_after_opener_tag_and_before_closer_tag_positions( bool $rew } list( $opener_tag, $closer_tag ) = $bookmarks; - $after_opener_tag = $this->bookmarks[ $opener_tag ]->start + $this->bookmarks[ $opener_tag ]->length + 1; + $after_opener_tag = $this->bookmarks[ $opener_tag ]->start + $this->bookmarks[ $opener_tag ]->length; $before_closer_tag = $this->bookmarks[ $closer_tag ]->start; if ( $rewind ) { From 8fa36572ea27a87f9cb8cdc716893200eb8829ef Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 24 May 2024 18:34:17 +0200 Subject: [PATCH 4/9] Remove off-by-1 adjustment (cherry picked from commit 57e737cc4591ed3929521493e19a1de8ff253a8f) --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index f2327042bf464..cbd077e8666a9 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1013,7 +1013,7 @@ private function base_class_next_token() { */ $this->token_starts_at = $was_at; $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; - $this->text_starts_at = $tag_ends_at + 1; + $this->text_starts_at = $tag_ends_at; $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; $this->tag_name_starts_at = $tag_name_starts_at; $this->tag_name_length = $tag_name_length; From 92f2c4889a2de00b7433bfe7caac78ff83427bdf Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 24 May 2024 17:03:51 +0200 Subject: [PATCH 5/9] Fix tag token length error (cherry picked from commit cdf74627a360fb3c6b8b4222c6e98976841ec39c) --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php 
b/src/wp-includes/html-api/class-wp-html-tag-processor.php index cbd077e8666a9..26d22c072e48e 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -926,8 +926,8 @@ private function base_class_next_token() { return false; } $this->parser_state = self::STATE_MATCHED_TAG; - $this->token_length = $tag_ends_at - $this->token_starts_at; $this->bytes_already_parsed = $tag_ends_at + 1; + $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; /* * For non-DATA sections which might contain text that looks like HTML tags but From 53b18eea4f592ed4c3666f92d26c656b76fed6d2 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 27 May 2024 11:07:46 -0700 Subject: [PATCH 6/9] Properly split special atomic elements. --- src/wp-includes/formatting.php | 64 ++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 51714a0c94995..dae466f7ed8b0 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -605,21 +605,13 @@ function wpautop( $text, $br = true ) { } /** - * Separates HTML elements and comments from the text. - * - * This function tokenizes an HTML document into its - * components and returns the array of tokens. + * Returns a Tag Processor exposing the raw matched tokens. * - * @since 4.2.4 - * @since 6.6.0 Relies on the HTML API for parsing. - * - * @param string $input_html Raw HTML potentially containing a mixture of tags, - * comments, text nodes, and other sytnax. - * @return string[] + * @param string $html Passed into the Tag Processor. 
+ * @return WP_HTML_Tag_Processor|__anonymous@23567 */ -function wp_html_split( $input_html ) { - $chunks = array(); - $processor = new class ( $input_html ) extends WP_HTML_Tag_Processor { +function wp_get_internal_tag_processor( $html ) { + return new class( $html ) extends WP_HTML_Tag_Processor { /** * Returns the raw token from the input string at the * current location, if paused at a location. @@ -641,6 +633,24 @@ public function get_raw_token() { return substr( $this->html, $here->start, $here->length ); } }; +} + +/** + * Separates HTML elements and comments from the text. + * + * This function tokenizes an HTML document into its + * components and returns the array of tokens. + * + * @since 4.2.4 + * @since 6.6.0 Relies on the HTML API for parsing. + * + * @param string $input_html Raw HTML potentially containing a mixture of tags, + * comments, text nodes, and other syntax. + * @return string[] + */ +function wp_html_split( $input_html ) { + $chunks = array(); + $processor = wp_get_internal_tag_processor( $input_html ); while ( $processor->next_token() ) { $is_special_atomic_element = in_array( @@ -651,11 +661,29 @@ public function get_raw_token() { // @todo Transfer everything properly. if ( $is_special_atomic_element ) { - $raw_html = $processor->get_raw_token(); - $tag_name = substr( $raw_html, 1, strlen( $processor->get_tag() ) ); - $chunks[] = "<{$tag_name}>"; - $chunks[] = $processor->get_modifiable_text(); - $chunks[] = ""; + $raw_html = $processor->get_raw_token(); + $tag_name = substr( $raw_html, 1, strlen( $processor->get_tag() ) ); + $modified = "<{$tag_name}" . substr( $raw_html, strlen( $tag_name ) + 1 ); + $modified[1] = 'X'; + + $special = wp_get_internal_tag_processor( $modified ); + + // The first tag is the modified tag. 
+ $special->next_tag(); + $opening_tag = $special->get_raw_token(); + $opening_tag[1] = $tag_name[0]; + $chunks[] = $opening_tag; + + $special->set_bookmark( 'last' ); + while ( $special->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $special->set_bookmark( 'last' ); + } + $special->seek( 'last' ); + $closing_tag = $special->get_raw_token(); + + $chunks[] = substr( $raw_html, strlen( $opening_tag ), -strlen( $closing_tag ) ); + $chunks[] = $closing_tag; + continue; } From 76708869c2bec5183a361d7c228df018cbce8392 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 27 May 2024 13:57:15 -0700 Subject: [PATCH 7/9] Rerarrange cod --- src/wp-includes/formatting.php | 59 +++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index dae466f7ed8b0..3c308c93bc2f8 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -607,6 +607,8 @@ function wpautop( $text, $br = true ) { /** * Returns a Tag Processor exposing the raw matched tokens. * + * @since 6.6.0 + * * @param string $html Passed into the Tag Processor. * @return WP_HTML_Tag_Processor|__anonymous@23567 */ @@ -659,35 +661,46 @@ function wp_html_split( $input_html ) { true ); - // @todo Transfer everything properly. - if ( $is_special_atomic_element ) { - $raw_html = $processor->get_raw_token(); - $tag_name = substr( $raw_html, 1, strlen( $processor->get_tag() ) ); - $modified = "<{$tag_name}" . substr( $raw_html, strlen( $tag_name ) + 1 ); - $modified[1] = 'X'; - - $special = wp_get_internal_tag_processor( $modified ); + if ( ! $is_special_atomic_element ) { + $chunks[] = $processor->get_raw_token(); + continue; + } - // The first tag is the modified tag. 
- $special->next_tag(); - $opening_tag = $special->get_raw_token(); - $opening_tag[1] = $tag_name[0]; - $chunks[] = $opening_tag; + /* + * For special atomic tags, it's necessary to redo some work to find + * the opening and closing tag, because the Tag Processor consumes + * them all in one go. + * + * By replacing the first character of the tag name, it's possible to + * trick the Tag Processor into thinking it's non-special content, and + * then get the starting and ending tags, then restore the tag name at + * the end. + * + * Because the end tags for these special atomic elements are matched + * even when they are unexpected, the final closing tag will be found + * after renaming the opening tag. + */ - $special->set_bookmark( 'last' ); - while ( $special->next_tag( array( 'tag_closers' => 'visit' ) ) ) { - $special->set_bookmark( 'last' ); - } - $special->seek( 'last' ); - $closing_tag = $special->get_raw_token(); + $raw_html = $processor->get_raw_token(); + $first_char = $raw_html[1]; + $raw_html[1] = 'X'; + $special = wp_get_internal_tag_processor( $raw_html ); - $chunks[] = substr( $raw_html, strlen( $opening_tag ), -strlen( $closing_tag ) ); - $chunks[] = $closing_tag; + // The first tag is the modified tag. + $special->next_tag(); + $opening_tag = $special->get_raw_token(); + $opening_tag[1] = $first_char; + $chunks[] = $opening_tag; - continue; + $special->set_bookmark( 'last' ); + while ( $special->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $special->set_bookmark( 'last' ); } + $special->seek( 'last' ); + $closing_tag = $special->get_raw_token(); - $chunks[] = $processor->get_raw_token(); + $chunks[] = substr( $raw_html, strlen( $opening_tag ), -strlen( $closing_tag ) ); + $chunks[] = $closing_tag; } return $chunks; From baefd6e52f1436e5618f773bb1722bf6d098729d Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 27 May 2024 16:00:31 -0700 Subject: [PATCH 8/9] Add text nodes where missing, to preserve legacy behavior. 
--- src/wp-includes/formatting.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 3c308c93bc2f8..f3ee41537cd9e 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -655,6 +655,16 @@ function wp_html_split( $input_html ) { $processor = wp_get_internal_tag_processor( $input_html ); while ( $processor->next_token() ) { + /* + * There's a legacy behavior where text nodes are always stored in even + * indices and "elements" are stored in odd indices. To preserve this, + * empty text nodes are inserted when there's none between other syntax + * tokens. + */ + if ( 0 === count( $chunks ) % 2 && '#text' !== $processor->get_token_name() ) { + $chunks[] = ''; + } + $is_special_atomic_element = in_array( $processor->get_tag(), array( 'SCRIPT', 'STYLE', 'XMP', 'NOEMBED', 'NOFRAMES', 'TITLE', 'TEXTAREA' ), From db9c49452848bc4962593428c1265f32926ac92e Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 24 May 2024 18:33:06 +0200 Subject: [PATCH 9/9] Remove off-by-1 adjustment --- tests/phpunit/tests/html-api/wpHtmlTagProcessor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php index 824630b33516a..ddebb7d98b4fb 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php @@ -2746,7 +2746,7 @@ public function test_applies_updates_before_proceeding() { public function insert_after( $new_html ) { $this->set_bookmark( 'here' ); $this->lexical_updates[] = new WP_HTML_Text_Replacement( - $this->bookmarks['here']->start + $this->bookmarks['here']->length + 1, + $this->bookmarks['here']->start + $this->bookmarks['here']->length, 0, $new_html );