From 6b7a125cf31fd425871ad871075f93d1dba6c003 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 15 Jul 2025 11:37:37 -0500 Subject: [PATCH 01/10] HTML API: Reliably parse HTML in `wp_html_split()`. Trac ticket: Core-63694 This probably improves the performance in terms of both CPU time and memory compared to the old PCRE-based approach. --- src/wp-includes/formatting.php | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index bd2d349fa20c1..13e2c0c3b370c 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -605,21 +605,41 @@ function wpautop( $text, $br = true ) { } /** - * Separates HTML elements and comments from the text. + * Splits an HTML input into an array of raw strings, where each token + * represents a tag, a comment, a text node, etc… + * + * No effort is made to clean up, sanitize, or normalize the segments + * of HTML. {@see WP_HTML_Processor::normalize()} for normalization. * * @since 4.2.4 + * @since {WP_VERSION} Reliably parses HTML via the HTML API. * * @param string $input The text which has to be formatted. * @return string[] Array of the formatted text. */ function wp_html_split( $input ) { - return preg_split( get_html_split_regex(), $input, -1, PREG_SPLIT_DELIM_CAPTURE ); + $token_reporter = new class( $input ) extends WP_HTML_Tag_Processor { + public function extract_raw_token() { + $this->set_bookmark( 'here' ); + $here = $this->bookmarks['here']; + + return substr( $this->html, $here->start, $here->length ); + } + }; + + $tokens = array(); + while ( $token_reporter->next_token() ) { + $tokens[] = $token_reporter->extract_raw_token(); + } + + return $tokens; } /** * Retrieves the regular expression for an HTML element. * * @since 4.4.0 + * @deprecated {WP_VERSION} Use the HTML API instead. * * @return string The regular expression. 
*/ From d50fd255136a751349cc03594cdecfbf52186ca6 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 11 Sep 2025 17:18:58 -0500 Subject: [PATCH 02/10] Deprecation notice for html split and version --- src/wp-includes/formatting.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 13e2c0c3b370c..de14253716402 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -639,13 +639,19 @@ public function extract_raw_token() { * Retrieves the regular expression for an HTML element. * * @since 4.4.0 - * @deprecated {WP_VERSION} Use the HTML API instead. + * @deprecated 6.9.0 Use the HTML API instead. * * @return string The regular expression. */ function get_html_split_regex() { static $regex; + _deprecated_function( + __FUNCTION__, + '6.9.0', + 'Use the HTML API instead.' + ); + if ( ! isset( $regex ) ) { // phpcs:disable Squiz.Strings.ConcatenationSpacing.PaddingFound -- don't remove regex indentation $comments = From 1eb9f574fd50f47707259828f6ff68dea9d966dc Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 15 Jul 2025 15:49:45 -0500 Subject: [PATCH 03/10] Fix two broken tests on CDATA behavior --- tests/phpunit/tests/formatting/wpAutop.php | 2 +- tests/phpunit/tests/formatting/wpHtmlSplit.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/phpunit/tests/formatting/wpAutop.php b/tests/phpunit/tests/formatting/wpAutop.php index ee4d90645d09c..cd868bf7ffd2c 100644 --- a/tests/phpunit/tests/formatting/wpAutop.php +++ b/tests/phpunit/tests/formatting/wpAutop.php @@ -538,7 +538,7 @@ public function data_element_sanity() { ), array( "Hello a\n9 ]]>", - "

Hello a\n9 ]]>

\n", + "

Hello a
\n9 ]]>

\n", ), ); } diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php index 750ad3821cc54..f7d922d114f6b 100644 --- a/tests/phpunit/tests/formatting/wpHtmlSplit.php +++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php @@ -32,7 +32,7 @@ public function data_basic_features() { ), array( 'abcd ]]> efgh', - array( 'abcd ', ' ]]>', ' efgh' ), + array( 'abcd ', '', ' ]]> efgh' ), ), ); } From a8ae28f0faba9c22189bcb3996dc899326149353 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 11 Sep 2025 17:22:05 -0500 Subject: [PATCH 04/10] Explain tests that were previously asserting invalid behaviors. (formatting) --- tests/phpunit/tests/formatting/wpAutop.php | 9 +++++++++ tests/phpunit/tests/formatting/wpHtmlSplit.php | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/tests/phpunit/tests/formatting/wpAutop.php b/tests/phpunit/tests/formatting/wpAutop.php index cd868bf7ffd2c..b97468e7830f8 100644 --- a/tests/phpunit/tests/formatting/wpAutop.php +++ b/tests/phpunit/tests/formatting/wpAutop.php @@ -536,6 +536,15 @@ public function data_element_sanity() { "Hello -->", "

Hello -->

\n", ), + /* + * CDATA sections do not exist within HTML, so even though it looks + * like this should be escaping the entire “inner comment” span, there’s + * actually an invalid comment starting at `` character, placing the end of the comment at + * what looks like the end of the “normal comment” — the ` -->`. Everything + * afterward is normal HTML data so the `
` is a real `BR` element and + * the `]]>` is normal plaintext, not the CDATA terminator. + */ array( "Hello a\n9 ]]>", "

Hello a
\n9 ]]>

\n", diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php index f7d922d114f6b..048dedfd081a7 100644 --- a/tests/phpunit/tests/formatting/wpHtmlSplit.php +++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php @@ -30,6 +30,13 @@ public function data_basic_features() { 'abcd efgh', array( 'abcd ', '', ' efgh' ), ), + /* + * CDATA sections do not exist within HTML, so even though it looks + * like this should be escaping the entire ` ` span, there’s + * actually an invalid comment starting at `` character, placing the end of the comment at + * the end of `html>`. The rest is normal plaintext content. + */ array( 'abcd ]]> efgh', array( 'abcd ', '', ' ]]> efgh' ), From d6e05851ccd97694e427ee3755fe6422f88d59c0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 11 Sep 2025 17:45:25 -0500 Subject: [PATCH 05/10] HTML Split: Match legacy behavior from preg_split --- src/wp-includes/formatting.php | 67 +++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 5 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index de14253716402..608f66fcf03a9 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -611,11 +611,44 @@ function wpautop( $text, $br = true ) { * No effort is made to clean up, sanitize, or normalize the segments * of HTML. {@see WP_HTML_Processor::normalize()} for normalization. * + * Consider using the HTML API directly instead of relying on this + * legacy function: it bloats memory by default and provides a text + * interface for working with HTML whereas the HTML API provides a + * low-overhead and convenient structural interface. + * + * ## Output format: + * + * To maintain legacy behaviors with this function from when it + * operated via {@see preg_split()}, the output array injects text + * nodes which do not appear in the source HTML. 
That is, the original + * array, relying on {@see PREG_SPLIT_DELIM_CAPTURE}, included a text + * span on each side of every tag-like or comment-like “delimiter” in + * the matched string. + * + * Therefore, the output array will always start and end with text nodes + * and will separate every non-text node with a text node. If there is no + * actual content in the interstitial space between tokens in the source + * document, an empty text node will be created. + * + * Example: + * + * array( '', '', '' ) === wp_html_split( '' ); + * array( 'test' ) === wp_html_split( 'test' ); + * array( '', '

', 'test' ) === wp_html_split( '

test' ); + * array( 'test', '

', '' ) === wp_html_split( 'test

' ); + * + * array( '', '
', '', '', '' ) === wp_html_split( '
' ); + * + * // To avoid ambiguity, leading less-than signs (<) in text nodes are encoded. + * array( '&#60;3' ) === wp_html_split( '<3' ); + * * @since 4.2.4 - * @since {WP_VERSION} Reliably parses HTML via the HTML API. + * @since 6.9.0 Reliably parses HTML via the HTML API. * - * @param string $input The text which has to be formatted. - * @return string[] Array of the formatted text. + * @param string $input HTML document to split, one item for every token. + * These can be text nodes, tags, comments, or doctype declarations. + * @return string[] Tokens from input; starting and ending in a text node, and with text + * nodes between every non-text node (see docblock note). */ function wp_html_split( $input ) { $token_reporter = new class( $input ) extends WP_HTML_Tag_Processor { @@ -627,9 +660,33 @@ public function extract_raw_token() { } }; - $tokens = array(); + $tokens = array(); + $was_text = false; while ( $token_reporter->next_token() ) { - $tokens[] = $token_reporter->extract_raw_token(); + $raw_token = $token_reporter->extract_raw_token(); + $is_text = '#text' === $token_reporter->get_token_name(); + + if ( ! $is_text && ! $was_text ) { + $tokens[] = ''; + } + + /* + * Some legacy code assumes that text nodes will never start with a + * less-than sign (<) but this isn’t the case, as some text nodes do + * if the less-than sign doesn’t introduce a syntax token. To avoid + * further corruption a leading less-than sign is replaced by its + * encoded equivalent numeric character reference. + */ + if ( $is_text && '<' === ( $raw_token[0] ?? '' ) ) { + $raw_token = '<' . substr( $raw_token, 1 ); + } + + $tokens[] = $raw_token; + $was_text = $is_text; + } + + if ( ! 
$was_text ) { + $tokens[] = ''; } return $tokens; From d091ce5da3e98e5687f92b6194ab66d7d935332e Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 15 Jul 2025 15:50:03 -0500 Subject: [PATCH 06/10] Fix broken comments controller test: wp_replace_in_html_tags() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Was detecting a non-escaped `<` as the start of an “element” and then replaced a newline in the text as `` since it thought it was replacing inside a tag. In the end that translated into a raw `\n` again. --- .../rest-api/rest-comments-controller.php | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/tests/phpunit/tests/rest-api/rest-comments-controller.php b/tests/phpunit/tests/rest-api/rest-comments-controller.php index 8542bcd42af24..c553019eb28c9 100644 --- a/tests/phpunit/tests/rest-api/rest-comments-controller.php +++ b/tests/phpunit/tests/rest-api/rest-comments-controller.php @@ -3152,24 +3152,50 @@ public function test_comment_roundtrip_as_editor_unfiltered_html() { } } + /** + * Ensures that saving a comment as a super-admin does not corrupt the + * comment content when presented with common edge cases. + * + * Note that this test used to assert the wrong behavior due to a bug + * in {@see wp_html_split()}: the unescaped `<` used to be + * mistakenly identified as the start of an HTML tag or comment, which + * led to accidental replacement “inside” the mistaken tag. The test + * has been updated with `wp_html_split()` in accordance with the + * HTML5 living specification. + * + * @ticket 63694 + */ public function test_comment_roundtrip_as_superadmin() { wp_set_current_user( self::$superadmin_id ); + $raw_content = <<<'HTML' +\\&\\ & &invalid; < < &lt; +HTML; + $rendered = <<<'HTML' +

\\&\\ & &invalid; < < &lt;

+HTML; + $author_name = <<<'HTML' +\\&\\ & &invalid; < < &lt; +HTML; + $author_user_agent = <<<'HTML' +\\&\\ & &invalid; < < &lt; +HTML; + $this->assertTrue( current_user_can( 'unfiltered_html' ) ); $this->verify_comment_roundtrip( array( - 'content' => '\\\&\\\ & &invalid; < < &lt;', - 'author_name' => '\\\&\\\ & &invalid; < < &lt;', - 'author_user_agent' => '\\\&\\\ & &invalid; < < &lt;', + 'content' => $raw_content, + 'author_name' => $raw_content, + 'author_user_agent' => $raw_content, 'author' => self::$superadmin_id, ), array( 'content' => array( - 'raw' => '\\\&\\\ & &invalid; < < &lt;', - 'rendered' => '

\\\&\\\ & &invalid; < < &lt;' . "\n

", + 'raw' => $raw_content, + 'rendered' => $rendered, ), - 'author_name' => '\\\&\\\ & &invalid; < < &lt;', - 'author_user_agent' => '\\\&\\\ & &invalid; < < &lt;', + 'author_name' => $author_name, + 'author_user_agent' => $author_user_agent, 'author' => self::$superadmin_id, ) ); From d5b789c4793e659161eb7707cf973046ca372697 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 11 Sep 2025 18:00:07 -0500 Subject: [PATCH 07/10] Fixup! Tests --- tests/phpunit/tests/formatting/wpHtmlSplit.php | 13 ------------- tests/phpunit/tests/shortcode.php | 2 +- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php index 048dedfd081a7..ad982059d9b2a 100644 --- a/tests/phpunit/tests/formatting/wpHtmlSplit.php +++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php @@ -44,19 +44,6 @@ public function data_basic_features() { ); } - /** - * Automated performance testing of the main regex. - * - * @dataProvider data_whole_posts - * - * @covers ::get_html_split_regex - */ - public function test_pcre_performance( $input ) { - $regex = get_html_split_regex(); - $result = benchmark_pcre_backtracking( $regex, $input, 'split' ); - return $this->assertLessThan( 200, $result ); - } - public function data_whole_posts() { require_once DIR_TESTDATA . 
'/formatting/whole-posts.php'; return data_whole_posts(); diff --git a/tests/phpunit/tests/shortcode.php b/tests/phpunit/tests/shortcode.php index 269da2b05e34d..470ddb527ce0a 100644 --- a/tests/phpunit/tests/shortcode.php +++ b/tests/phpunit/tests/shortcode.php @@ -622,7 +622,7 @@ public function data_escaping() { * @dataProvider data_escaping2 */ public function test_escaping2( $input, $output ) { - return $this->assertSame( $output, strip_shortcodes( $input ) ); + return $this->assertEqualHTML( $output, strip_shortcodes( $input ) ); } public function data_escaping2() { From 556b3086044cd464cf8dea765fd1136274e31d86 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 11 Sep 2025 20:26:34 -0500 Subject: [PATCH 08/10] Process incomplete HTML tokens as raw text anyway --- src/wp-includes/formatting.php | 18 ++++++++++++++++++ tests/phpunit/tests/shortcode.php | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 608f66fcf03a9..228257c591689 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -662,8 +662,10 @@ public function extract_raw_token() { $tokens = array(); $was_text = false; + $next_at = 0; while ( $token_reporter->next_token() ) { $raw_token = $token_reporter->extract_raw_token(); + $next_at += strlen( $raw_token ); $is_text = '#text' === $token_reporter->get_token_name(); if ( ! $is_text && ! $was_text ) { @@ -685,6 +687,22 @@ public function extract_raw_token() { $was_text = $is_text; } + /* + * The HTML API aborts when a string ends with the start of a + * token which isn’t complete, such as an un-closed comment. + * Typically it’s best to avoid processing or passing along + * that content because it could impact any HTML which follows + * it. However, to maintain backwards compatibility this last + * segment needs to appear. + */ + if ( $token_reporter->paused_at_incomplete_token() ) { + if ( ! 
$was_text ) { + $tokens[] = ''; + } + $was_text = false; + $tokens[] = substr( $input, $next_at ); + } + if ( ! $was_text ) { $tokens[] = ''; } diff --git a/tests/phpunit/tests/shortcode.php b/tests/phpunit/tests/shortcode.php index 470ddb527ce0a..d78fa98f07aaa 100644 --- a/tests/phpunit/tests/shortcode.php +++ b/tests/phpunit/tests/shortcode.php @@ -544,7 +544,7 @@ public function test_spaces_around_shortcodes() { * @dataProvider data_escaping */ public function test_escaping( $input, $output ) { - return $this->assertSame( $output, do_shortcode( $input ) ); + return $this->assertEqualHTML( $output, do_shortcode( $input ) ); } public function data_escaping() { From a8bd5321818991f4dd5f802c2863c015e8f0c7ba Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 21 Oct 2025 02:04:02 -0700 Subject: [PATCH 09/10] wp_html_split() match legacy shortcode-tags --- src/wp-includes/formatting.php | 47 +++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 228257c591689..75ecf0adba18b 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -668,23 +668,46 @@ public function extract_raw_token() { $next_at += strlen( $raw_token ); $is_text = '#text' === $token_reporter->get_token_name(); - if ( ! $is_text && ! $was_text ) { - $tokens[] = ''; + if ( ! $is_text ) { + if ( ! $was_text ) { + $tokens[] = ''; + } + + $tokens[] = $raw_token; + $was_text = false; + continue; } /* - * Some legacy code assumes that text nodes will never start with a - * less-than sign (<) but this isn’t the case, as some text nodes do - * if the less-than sign doesn’t introduce a syntax token. To avoid - * further corruption a leading less-than sign is replaced by its - * encoded equivalent numeric character reference. 
+ * WordPress looks for shortcodes and escaped shortcodes within the HTML + * where they look like tags but HTML wouldn’t consider them tags, such + * as in "<[header level=2]>". Look for these and artificially split the + * text nodes where it looks like shortcodes reside inside. */ - if ( $is_text && '<' === ( $raw_token[0] ?? '' ) ) { - $raw_token = '<' . substr( $raw_token, 1 ); - } + $shortcode_pattern = get_shortcode_regex(); + $text_chunks = preg_split( "~(<{$shortcode_pattern}>)~", $raw_token, -1, PREG_SPLIT_DELIM_CAPTURE ); + foreach ( $text_chunks as $i => $token ) { + // Captured delimiters land on odd indices; parenthesize because `===` binds tighter than `&`. + $is_shortcode_tag = 0x01 === ( $i & 0x01 ); + + if ( $is_shortcode_tag && ! $was_text ) { + $tokens[] = ''; + } - $tokens[] = $raw_token; - $was_text = $is_text; + /* + * Some legacy code assumes that text nodes will never start with a + * less-than sign (<) but this isn’t the case, as some text nodes do + * if the less-than sign doesn’t introduce a syntax token. To avoid + * further corruption a leading less-than sign is replaced by its + * encoded equivalent numeric character reference. + */ + if ( ! $is_shortcode_tag && '<' === ( $token[0] ?? '' ) ) { + $token = '&#60;' . substr( $token, 1 ); + } + + $was_text = ! $is_shortcode_tag; + $tokens[] = $token; + } } /* From 638e6659eb3e8605aeb15498acebb1d7775f105b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 15 Jul 2025 14:28:54 -0500 Subject: [PATCH 10/10] Refactor `wp_replace_in_html_tags()` --- src/wp-includes/formatting.php | 81 +++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index 75ecf0adba18b..36c1f3f2a9296 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -872,16 +872,44 @@ function _get_wptexturize_shortcode_regex( $tagnames ) { /** * Replaces characters or phrases within HTML elements only. 
* + * This is a dangerous function which can break HTML syntax, + * consider using methods from the HTML API instead. + * + * Example: + * + * '

data-class

' === wp_replace_in_html_tags( + * '

data-class

', + * array( 'data-class' => 'class' ) + * ); + * * @since 4.2.3 + * @since 6.9.0 Reliably parses HTML via the HTML API. * - * @param string $haystack The text which has to be formatted. + * @param string $html Replace matches inside the tags of this HTML. * @param array $replace_pairs In the form array('from' => 'to', ...). - * @return string The formatted text. + * @return string HTML after replacing the `$replace_pairs` matches, but only those + * matches which appear inside HTML opening and closing tags. */ -function wp_replace_in_html_tags( $haystack, $replace_pairs ) { - // Find all elements. - $textarr = wp_html_split( $haystack ); - $changed = false; +function wp_replace_in_html_tags( $html, $replace_pairs ) { + $token_updater = new class( $html ) extends WP_HTML_Tag_Processor { + public function extract_raw_token() { + $this->set_bookmark( 'here' ); + $here = $this->bookmarks['here']; + + return substr( $this->html, $here->start, $here->length ); + } + + public function replace_raw_token( $new_raw_html ) { + $this->set_bookmark( 'here' ); + $here = $this->bookmarks['here']; + + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $here->start, + $here->length, + $new_raw_html + ); + } + }; // Optimize when searching for one item. if ( 1 === count( $replace_pairs ) ) { @@ -889,35 +917,44 @@ function wp_replace_in_html_tags( $haystack, $replace_pairs ) { $needle = array_key_first( $replace_pairs ); $replace = $replace_pairs[ $needle ]; - // Loop through delimiters (elements) only. 
- for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) { - if ( str_contains( $textarr[ $i ], $needle ) ) { - $textarr[ $i ] = str_replace( $needle, $replace, $textarr[ $i ] ); - $changed = true; + while ( $token_updater->next_token() ) { + if ( '#text' === $token_updater->get_token_name() ) { + continue; + } + + $token = $token_updater->extract_raw_token(); + $updated = str_replace( $needle, $replace, $token ); + + if ( $token !== $updated ) { + $token_updater->replace_raw_token( $updated ); } } } else { // Extract all $needles. $needles = array_keys( $replace_pairs ); - // Loop through delimiters (elements) only. - for ( $i = 1, $c = count( $textarr ); $i < $c; $i += 2 ) { + while ( $token_updater->next_token() ) { + if ( '#text' === $token_updater->get_token_name() ) { + continue; + } + + $token = $token_updater->extract_raw_token(); + $updated = $token; + foreach ( $needles as $needle ) { - if ( str_contains( $textarr[ $i ], $needle ) ) { - $textarr[ $i ] = strtr( $textarr[ $i ], $replace_pairs ); - $changed = true; - // After one strtr() break out of the foreach loop and look at next element. + if ( str_contains( $token, $needle ) ) { + $updated = strtr( $updated, $replace_pairs ); break; } } - } - } - if ( $changed ) { - $haystack = implode( $textarr ); + if ( $token !== $updated ) { + $token_updater->replace_raw_token( $updated ); + } + } } - return $haystack; + return $token_updater->get_updated_html(); } /**