From d3dd98e7f3e889f28c6e8a31f5ebf5bfcb70f73c Mon Sep 17 00:00:00 2001
From: Dennis Snell
Date: Tue, 15 Jul 2025 11:37:37 -0500
Subject: [PATCH 1/2] HTML API: Reliably parse HTML in `wp_html_split()`.
Trac ticket: Core-63694
This probably improves the performance in terms of both CPU time and
memory compared to the old PCRE-based approach.
---
src/wp-includes/formatting.php | 132 +++++++++++++++++-
tests/phpunit/tests/formatting/wpAutop.php | 11 +-
.../phpunit/tests/formatting/wpHtmlSplit.php | 22 ++-
.../rest-api/rest-comments-controller.php | 40 +++++-
tests/phpunit/tests/shortcode.php | 4 +-
5 files changed, 181 insertions(+), 28 deletions(-)
diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index f59f877775b77..d77e37c15391b 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -605,27 +605,151 @@ function wpautop( $text, $br = true ) {
}
/**
- * Separates HTML elements and comments from the text.
+ * Splits an HTML input into an array of raw strings, where each token
+ * represents a tag, a comment, a text node, etc…
+ *
+ * No effort is made to clean up, sanitize, or normalize the segments
+ * of HTML. {@see WP_HTML_Processor::normalize()} for normalization.
+ *
+ * Consider using the HTML API directly instead of relying on this
+ * legacy function: it bloats memory by default and provides a text
+ * interface for working with HTML whereas the HTML API provides a
+ * low-overhead and convenient structural interface.
+ *
+ * ## Output format:
+ *
+ * To maintain legacy behaviors with this function from when it
+ * operated via {@see preg_split()}, the output array injects text
+ * nodes which do not appear in the source HTML. That is, the original
+ * array, relying on {@see PREG_SPLIT_DELIM_CAPTURE}, included a text
+ * span on each side of every tag-like or comment-like “delimiter” in
+ * the matched string.
+ *
+ * Therefore, the output array will always start and end with text nodes
+ * and will separate every non-text node with a text node. If there is no
+ * actual content in the interstitial space between tokens in the source
+ * document, an empty text node will be created.
+ *
+ * Example:
+ *
+ * array( '', '
', '' ) === wp_html_split( '
' );
+ * array( 'test' ) === wp_html_split( 'test' );
+ * array( '', '', 'test' ) === wp_html_split( '
test' );
+ * array( 'test', '
', '' ) === wp_html_split( 'test
' );
+ *
+ * array( '', '
', '', '', '' ) === wp_html_split( '
' );
+ *
+ * // To avoid ambiguity, leading less-than signs (<) in text nodes are encoded.
+ * array( '<3' ) === wp_split_html( '<3' );
*
* @since 4.2.4
+ * @since 6.9.0 Reliably parses HTML via the HTML API.
*
- * @param string $input The text which has to be formatted.
- * @return string[] Array of the formatted text.
+ * @param string $input HTML document to split, one item for every token.
+ * These can be text nodes, tags, comments, or doctype declarations.
+ * @return string[] Tokens from input; starting and ending in a text node, and with text
+ * nodes between every non-text node (see docblock note).
*/
function wp_html_split( $input ) {
- return preg_split( get_html_split_regex(), $input, -1, PREG_SPLIT_DELIM_CAPTURE );
+ $token_reporter = new class( $input ) extends WP_HTML_Tag_Processor {
+ public function extract_raw_token() {
+ $this->set_bookmark( 'here' );
+ $here = $this->bookmarks['here'];
+
+ return substr( $this->html, $here->start, $here->length );
+ }
+ };
+
+ $tokens = array();
+ $was_text = false;
+ $next_at = 0;
+ while ( $token_reporter->next_token() ) {
+ $raw_token = $token_reporter->extract_raw_token();
+ $next_at += strlen( $raw_token );
+ $is_text = '#text' === $token_reporter->get_token_name();
+
+ if ( ! $is_text ) {
+ if ( ! $was_text ) {
+ $tokens[] = '';
+ }
+
+ $tokens[] = $raw_token;
+ $was_text = false;
+ continue;
+ }
+
+ /*
+ * WordPress looks for shortcodes and escaped shortcodes within the HTML
+ * where they look like tags but HTML wouldn’t consider them tags, such
+ * as in "<[header level=2]>". Look for these and artificially split the
+ * text nodes where it looks like shortcodes reside inside.
+ */
+ $shortcode_pattern = get_shortcode_regex();
+ $text_chunks = preg_split( "~(<{$shortcode_pattern}>)~", $raw_token, -1, PREG_SPLIT_DELIM_CAPTURE );
+ foreach ( $text_chunks as $i => $token ) {
+ // The preg_split() always puts captured delimiters in the odd indices.
+ $is_shortcode_tag = 0x01 === $i & 0x01;
+
+ if ( $is_shortcode_tag && ! $was_text ) {
+ $tokens[] = '';
+ }
+
+ /*
+ * Some legacy code assumes that text nodes will never start with a
+ * less-than sign (<) but this isn’t the case, as some text nodes do
+ * if the less-than sign doesn’t introduce a syntax token. To avoid
+ * further corruption a leading less-than sign is replaced by its
+ * encoded equivalent numeric character reference.
+ */
+ if ( ! $is_shortcode_tag && '<' === ( $token[0] ?? '' ) ) {
+ $token = '<' . substr( $token, 1 );
+ }
+
+ $was_text = ! $is_shortcode_tag;
+ $tokens[] = $token;
+ }
+ }
+
+ /*
+ * The HTML API aborts when a string ends with the start of a
+ * token which isn’t complete, such as an un-closed comment.
+ * Typically it’s best to avoid processing or passing along
+ * that content because it could impact any HTML which follows
+ * it. However, to maintain backwards compatibility this last
+ * segment needs to appear.
+ */
+ if ( $token_reporter->paused_at_incomplete_token() ) {
+ if ( ! $was_text ) {
+ $tokens[] = '';
+ }
+ $was_text = false;
+ $tokens[] = substr( $input, $next_at );
+ }
+
+ if ( ! $was_text ) {
+ $tokens[] = '';
+ }
+
+ return $tokens;
}
/**
* Retrieves the regular expression for an HTML element.
*
* @since 4.4.0
+ * @deprecated 6.9.0 Use the HTML API instead.
*
* @return string The regular expression.
*/
function get_html_split_regex() {
static $regex;
+ _deprecated_function(
+ __FUNCTION__,
+ '6.9.0',
+ 'Use the HTML API instead.'
+ );
+
if ( ! isset( $regex ) ) {
// phpcs:disable Squiz.Strings.ConcatenationSpacing.PaddingFound -- don't remove regex indentation
$comments =
diff --git a/tests/phpunit/tests/formatting/wpAutop.php b/tests/phpunit/tests/formatting/wpAutop.php
index ee4d90645d09c..b97468e7830f8 100644
--- a/tests/phpunit/tests/formatting/wpAutop.php
+++ b/tests/phpunit/tests/formatting/wpAutop.php
@@ -536,9 +536,18 @@ public function data_element_sanity() {
"Hello -->",
"Hello -->
\n",
),
+ /*
+ * CDATA sections do not exist within HTML, so even though it looks
+ * like this should be escaping the entire “inner comment” span, there’s
+ * actually an invalid comment starting at `<![CDATA[` and ending at the next `>` character, placing the end of the comment at
+ * what looks like the end of the “normal comment” — the ` -->`. Everything
+ * afterward is normal HTML data so the `
` is a real `BR` element and
+ * the `]]>` is normal plaintext, not the CDATA terminator.
+ */
array(
"Hello a\n9 ]]>",
- "Hello a\n9 ]]>
\n",
+ "Hello a
\n9 ]]>
\n",
),
);
}
diff --git a/tests/phpunit/tests/formatting/wpHtmlSplit.php b/tests/phpunit/tests/formatting/wpHtmlSplit.php
index 750ad3821cc54..ad982059d9b2a 100644
--- a/tests/phpunit/tests/formatting/wpHtmlSplit.php
+++ b/tests/phpunit/tests/formatting/wpHtmlSplit.php
@@ -30,26 +30,20 @@ public function data_basic_features() {
'abcd efgh',
array( 'abcd ', '', ' efgh' ),
),
+ /*
+ * CDATA sections do not exist within HTML, so even though it looks
+ * like this should be escaping the entire ` <html> ` span, there’s
+ * actually an invalid comment starting at `<![CDATA[` and ending at the next `>` character, placing the end of the comment at
+ * the end of `html>`. The rest is normal plaintext content.
+ */
array(
'abcd ]]> efgh',
- array( 'abcd ', ' ]]>', ' efgh' ),
+ array( 'abcd ', '', ' ]]> efgh' ),
),
);
}
- /**
- * Automated performance testing of the main regex.
- *
- * @dataProvider data_whole_posts
- *
- * @covers ::get_html_split_regex
- */
- public function test_pcre_performance( $input ) {
- $regex = get_html_split_regex();
- $result = benchmark_pcre_backtracking( $regex, $input, 'split' );
- return $this->assertLessThan( 200, $result );
- }
-
public function data_whole_posts() {
require_once DIR_TESTDATA . '/formatting/whole-posts.php';
return data_whole_posts();
diff --git a/tests/phpunit/tests/rest-api/rest-comments-controller.php b/tests/phpunit/tests/rest-api/rest-comments-controller.php
index 8542bcd42af24..c553019eb28c9 100644
--- a/tests/phpunit/tests/rest-api/rest-comments-controller.php
+++ b/tests/phpunit/tests/rest-api/rest-comments-controller.php
@@ -3152,24 +3152,50 @@ public function test_comment_roundtrip_as_editor_unfiltered_html() {
}
}
+ /**
+ * Ensures that saving a comment as a super-admin does not corrupt the
+ * comment content when presented with common edge cases.
+ *
+ * Note that this test used to assert the wrong behavior due to a bug
+ * in {@see wp_html_split()}: an unescaped `<` used to be mistakenly
+ * identified as the start of an HTML tag or comment, which led to
+ * accidental replacement “inside” the mistaken tag. The test has
+ * been updated together with `wp_html_split()` to follow the HTML
+ * living standard.
+ *
+ * @ticket 63694
+ */
public function test_comment_roundtrip_as_superadmin() {
wp_set_current_user( self::$superadmin_id );
+ $raw_content = <<<'HTML'
+\\&\\ & &invalid; < < <
+HTML;
+ $rendered = <<<'HTML'
+\\&\\ & &invalid; < < <
+HTML;
+ $author_name = <<<'HTML'
+\\&\\ & &invalid; < < <
+HTML;
+ $author_user_agent = <<<'HTML'
+\\&\\ & &invalid; < < <
+HTML;
+
$this->assertTrue( current_user_can( 'unfiltered_html' ) );
$this->verify_comment_roundtrip(
array(
- 'content' => '\\\&\\\ & &invalid; < < <',
- 'author_name' => '\\\&\\\ & &invalid; < < <',
- 'author_user_agent' => '\\\&\\\ & &invalid; < < <',
+ 'content' => $raw_content,
+ 'author_name' => $raw_content,
+ 'author_user_agent' => $raw_content,
'author' => self::$superadmin_id,
),
array(
'content' => array(
- 'raw' => '\\\&\\\ & &invalid; < < <',
- 'rendered' => '\\\&\\\ & &invalid; < < <' . "\n
",
+ 'raw' => $raw_content,
+ 'rendered' => $rendered,
),
- 'author_name' => '\\\&\\\ & &invalid; < < <',
- 'author_user_agent' => '\\\&\\\ & &invalid; < < <',
+ 'author_name' => $author_name,
+ 'author_user_agent' => $author_user_agent,
'author' => self::$superadmin_id,
)
);
diff --git a/tests/phpunit/tests/shortcode.php b/tests/phpunit/tests/shortcode.php
index 7467d1ed7e6a3..560c31d649a75 100644
--- a/tests/phpunit/tests/shortcode.php
+++ b/tests/phpunit/tests/shortcode.php
@@ -544,7 +544,7 @@ public function test_spaces_around_shortcodes() {
* @dataProvider data_escaping
*/
public function test_escaping( $input, $output ) {
- return $this->assertSame( $output, do_shortcode( $input ) );
+ return $this->assertEqualHTML( $output, do_shortcode( $input ) );
}
public function data_escaping() {
@@ -622,7 +622,7 @@ public function data_escaping() {
* @dataProvider data_escaping2
*/
public function test_escaping2( $input, $output ) {
- return $this->assertSame( $output, strip_shortcodes( $input ) );
+ return $this->assertEqualHTML( $output, strip_shortcodes( $input ) );
}
public function data_escaping2() {
From fb69bf2d04f4e017de43d752d1a83c016589c31a Mon Sep 17 00:00:00 2001
From: Dennis Snell
Date: Mon, 12 Jan 2026 23:39:49 -0700
Subject: [PATCH 2/2] Comment updates and a fix to text node appending in
wp_html_split
---
src/wp-includes/formatting.php | 51 ++++++++++++++++++++++++++++------
1 file changed, 42 insertions(+), 9 deletions(-)
diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
index d77e37c15391b..8b3cee4f17eb5 100644
--- a/src/wp-includes/formatting.php
+++ b/src/wp-includes/formatting.php
@@ -643,7 +643,7 @@ function wpautop( $text, $br = true ) {
* array( '<3' ) === wp_split_html( '<3' );
*
* @since 4.2.4
- * @since 6.9.0 Reliably parses HTML via the HTML API.
+ * @since 7.0.0 Reliably parses HTML via the HTML API.
*
* @param string $input HTML document to split, one item for every token.
* These can be text nodes, tags, comments, or doctype declarations.
@@ -668,7 +668,14 @@ public function extract_raw_token() {
$next_at += strlen( $raw_token );
$is_text = '#text' === $token_reporter->get_token_name();
+ // This is a tag, comment, DOCTYPE declaration, malformed comment, etc…
if ( ! $is_text ) {
+ /*
+ * Code relies on the fact that this function always returns text
+ * in even indices and non-text tokens in odd indices. If there
+ * isn’t preceding text then an artificial and empty span needs
+ * to be added.
+ */
if ( ! $was_text ) {
$tokens[] = '';
}
@@ -681,14 +688,29 @@ public function extract_raw_token() {
/*
* WordPress looks for shortcodes and escaped shortcodes within the HTML
* where they look like tags but HTML wouldn’t consider them tags, such
- * as in "<[header level=2]>". Look for these and artificially split the
- * text nodes where it looks like shortcodes reside inside.
+ * as in "<[header level=2]>".
+ *
+ * This means that something WordPress wants to consider a tag might
+ * appear in the middle of a larger text span. To preserve that behavior
+ * it’s essential to look inside text nodes for these shortcode instances,
+ * and if found, split the string around them.
+ *
+ * Example:
+ *
+ * // HTML sees a single text span here.
+ * "This is <[tag-name]>important!"
+ *
+ * // It needs to break into three segments.
+ * "This is ", "<[tag-name]>", "important!"
+ *
+ * As with the rest of this function, text nodes must appear between these,
+ * implying the creation of empty nodes where they don’t already exist.
*/
$shortcode_pattern = get_shortcode_regex();
$text_chunks = preg_split( "~(<{$shortcode_pattern}>)~", $raw_token, -1, PREG_SPLIT_DELIM_CAPTURE );
foreach ( $text_chunks as $i => $token ) {
// The preg_split() always puts captured delimiters in the odd indices.
- $is_shortcode_tag = 0x01 === $i & 0x01;
+ $is_shortcode_tag = 1 === $i % 2;
if ( $is_shortcode_tag && ! $was_text ) {
$tokens[] = '';
@@ -698,8 +720,13 @@ public function extract_raw_token() {
* Some legacy code assumes that text nodes will never start with a
* less-than sign (<) but this isn’t the case, as some text nodes do
* if the less-than sign doesn’t introduce a syntax token. To avoid
- * further corruption a leading less-than sign is replaced by its
- * encoded equivalent numeric character reference.
+ * further corruption, a leading less-than sign is replaced by its
+ * equivalent numeric character reference.
+ *
+ * Example:
+ *
+ * input: "<3 the shortcodes like <[emoji-tag name=heart]>"
+ * output: "<3 the shortcodes like <[emoji-tag name=heart]>"
*/
if ( ! $is_shortcode_tag && '<' === ( $token[0] ?? '' ) ) {
$token = '<' . substr( $token, 1 );
@@ -719,11 +746,17 @@ public function extract_raw_token() {
* segment needs to appear.
*/
if ( $token_reporter->paused_at_incomplete_token() ) {
- if ( ! $was_text ) {
- $tokens[] = '';
+ $token = substr( $input, $next_at );
+ $syntax_like = '<' === ( $token[0] ?? '' );
+ $token = $syntax_like ? ( '<' . substr( $token, 1 ) ) : $token;
+
+ if ( $was_text ) {
+ $tokens[ count( $tokens ) - 1 ] .= $token;
+ } else {
+ $tokens[] = $token;
}
+
$was_text = false;
- $tokens[] = substr( $input, $next_at );
}
if ( ! $was_text ) {